def get_char(href): src = get("{}{}".format(base, href)).content soup = BeautifulSoup(src) name = soup.select("#WikiaPageHeader")[0].h1.text node = Node(name, "Character", "name", {"name": name}) paths = PathList() sys.stderr.write(name + "\n") try: sidebar = soup.select(".sidebar")[0] except IndexError: # No sidebar - probably not a named character raise ValueError(name) else: def get_sidebar_value(adjacent_text, default=None): try: return sidebar.find_all(lambda tag: tag.name == "td" and adjacent_text in tag.text)[ 0 ].next_sibling.text.strip() except IndexError: return default node.properties["name"] = get_sidebar_value("Name", name) if node.properties["name"].endswith(")"): node.properties["name"] = node.properties["name"].rpartition("(")[0].strip() nationality = get_sidebar_value("Nationality") if nationality is not None: node.properties["nationality"] = nationality node.properties["alive"] = not ("Deceased characters" in soup.text) if "Killed by Jack Bauer" in soup.text: paths.append(Path("jack_bauer", "KILLED!", name)) if "Killed by Tony Almeida" in soup.text: paths.append(Path("tony_almeida", "KILLED!", name)) if "Killed by Chase Edmunds" in soup.text: paths.append(Path("chase_edmunds", "KILLED!", name)) if "Killed by Curtis Manning" in soup.text: paths.append(Path("curtis_manning", "KILLED!", name)) if "Killed by Mandy" in soup.text: paths.append(Path("mandy", "KILLED!", name)) if "Killed by Nina Myers" in soup.text: paths.append(Path("nina_myers", "KILLED!", name)) if "Killed by Renee Walker" in soup.text: paths.append(Path("renee_walker", "KILLED!", name)) return node, paths
def iter_cast(season_node): season_number = season_node.properties["number"] src = get("{}/wiki/Season_{}".format(base, season_number)).content soup = BeautifulSoup(src) cast_heading = soup.select("#Cast")[0].parent cast_lists = cast_heading.find_next_siblings("ul") for cast_list in cast_lists: for cast_item in cast_list.children: terms = list(cast_item.children) found_as = False for term in cast_item.children: if isinstance(term, str) and term.startswith(" as "): found_as = True break if not found_as: continue links = cast_item.select("a") try: actor_a = links[0] actor_node, actor_paths = get_actor(actor_a.text) char_a = links[-1] char_node, char_paths = get_char(char_a["href"]) except ValueError: pass else: paths = PathList() trailing_words = terms[-1].strip().strip("()").split(",") first_bits = [x.partition("episode")[0].strip() for x in trailing_words if "episode" in x] episodes = int(first_bits[0]) paths.append( Path( actor_node.name, "STARRED_AS!", char_node.name, ("APPEARED_IN!", {"episodes": episodes}), season_node.name, ) ) paths.extend(actor_paths) paths.extend(char_paths) yield actor_node, char_node, paths