def get_company(self, company_id):
    """Scrapes all titles a company is credited for on IMDb.

    Will scrape all titles listed under a company on IMDb by going through
    each page in IMDb's `company search`, stepping the `start` index by 50
    per request. This only gives the year(s) the company was involved with
    each title and `notes` for each listed on IMDb.

    Paging stops as soon as a request answers with a 404 or a page has no
    `div.lister-list` node.

    Args:
        company_id (:obj:`str`): The company's ID used by IMDb prefixed with `co`.

    Yields:
        :class:`~.models.company.CompanyScrape`: An object for each title the company is credited for.

    Raises:
        HTTPError: If a request failed with a status other than 404.
        InvalidCompanyId: If an invalid company ID was given.
    """
    index = 1
    while True:
        request = ('https://www.imdb.com/search/title/'
                   f'?companies={company_id}&view=simple&start={index}')
        try:
            tree = self._get_tree(request)
        except requests.exceptions.HTTPError as e:
            # A 404 marks the end of the listing. Stop right here: there is
            # no tree for this page, so nothing below may run for it.
            if e.response is not None and e.response.status_code == 404:
                break
            raise

        # Check if this was a valid company ID
        company_title_node = tree.css_first('div.article > h1.header')
        if company_title_node:
            company_title = company_title_node.text().replace(
                '(Sorted by Popularity Ascending)', '').strip()
            if len(company_title) == 0:
                raise InvalidCompanyId(f'Invalid company ID: {company_id}')

        title_list_node = tree.css_first('div.lister-list')
        if not title_list_node:
            break

        for title_info_node in title_list_node.css('span.lister-item-header'):
            title_id = None
            start_year = None
            end_year = None
            notes = None
            year_info_node = None

            # Check if this is a TV episode
            episode_node = title_info_node.css_first('small')
            if episode_node and 'Episode' in episode_node.text():
                episode_link_node = title_info_node.css_first('small ~ a')
                title_id = get_title_id(episode_link_node)
                year_info_node = title_info_node.css_first(
                    'small ~ a ~ span.lister-item-year')
            else:
                title_info_node = title_info_node.css_first(
                    'span.lister-item-index ~ span')
                if title_info_node:
                    title_link_node = title_info_node.css_first('a')
                    title_id = get_title_id(title_link_node)
                    year_info_node = title_info_node.css_first(
                        'span.lister-item-year')

            if year_info_node:
                year_info_text = year_info_node.text().strip('()')
                years_match = re.search(r'(\d|–|-)+', year_info_text)
                notes_match = re.search(r'([A-Za-z]+\s*)+', year_info_text)
                if years_match:
                    year_info = re.split(r'[–\-]+', years_match.group(0))
                    if len(year_info) > 1:
                        # Only the first two parts are used so that an
                        # unexpected extra dash cannot abort the scrape
                        start_year, end_year = year_info[:2]
                        # Handle shows that are still on-air (ex: '2005- ')
                        if not end_year.strip():
                            end_year = None
                    else:
                        start_year, = year_info
                if notes_match:
                    notes = notes_match.group(0)

            yield CompanyScrape(company_id=company_id,
                                title_id=title_id,
                                start_year=start_year,
                                end_year=end_year,
                                notes=notes)

        index += 50
def get_title(self, title_id, include_taglines=False):
    """Scrapes information from the IMDb web page for the specified title.

    Uses the given title ID to request the IMDb page for the title and
    scrapes the page's information into a new `TitleScrape` object. An
    optional argument `include_taglines` allows an additional request to be
    made to gather all taglines IMDb has for the title.

    Every field that cannot be found on the page keeps its initial value
    (`None`, or an empty list for the list fields).

    Args:
        title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
        include_taglines (:obj:`bool`, optional): Specify if an extra request should be made
            to get all the taglines for the title

    Returns:
        :class:`~.models.title.TitleScrape`: An object containing the page's information.

    Raises:
        HTTPError: If the request failed.
    """
    request = f'https://www.imdb.com/title/{title_id}/'
    tree = self._get_tree(request)

    display_title = None
    title_parent_id = None
    rating = None
    country = None
    language = None
    release_date = None
    end_year = None
    season_number = None
    episode_number = None
    taglines = []
    plot = None
    storyline = None
    production_companies = []
    top_cast = []
    budget = None
    budget_denomination = None
    opening_weekend_gross = None
    opening_weekend_date = None
    usa_gross = None
    worldwide_gross = None

    # Get title text
    title_node = tree.css_first('div.title_wrapper')
    if title_node:
        display_title_node = title_node.css_first('h1')
        if display_title_node:
            # Remove title year
            title_year_node = display_title_node.css_first('span#titleYear')
            if title_year_node:
                title_year_node.decompose()
            display_title = display_title_node.text().strip()

        title_info_node = title_node.css_first('div.subtext')
        if title_info_node:
            # If this is a TV series, get the year the show ended
            for link_node in title_info_node.css('a'):
                # An attribute without a value is reported as None, so fall
                # back to an empty string before the substring test
                href = link_node.attributes.get('href') or ''
                if 'releaseinfo' not in href:
                    continue
                series_dates_match = re.search(r'\d{4}[-–](\d{4})',
                                               link_node.text())
                if series_dates_match:
                    end_year = series_dates_match.group(1)
                    break

            # Get MPAA Rating: drop the child tags, then whatever text is
            # left minus whitespace and commas is taken as the rating
            title_info_node.strip_tags(['span', 'a', 'time'])
            rating = re.sub(r'(\s|,)*', '', title_info_node.text()).strip()

    # Get title parent (if TV episode)
    title_parent_node = tree.css_first('div.titleParent > a')
    if title_parent_node:
        title_parent_id = get_title_id(title_parent_node)

    # Get plot
    plot_node = tree.css_first('div.summary_text')
    if plot_node:
        plot = plot_node.text().strip()

    # Get storyline
    storyline_node = tree.css_first('div#titleStoryLine')
    if storyline_node:
        storyline_node = storyline_node.css_first('div > p > span')
        if storyline_node:
            storyline = storyline_node.text().strip()

    # Get taglines
    if include_taglines:
        tagline_request = f'https://www.imdb.com/title/{title_id}/taglines'
        tagline_tree = self._get_tree(tagline_request)
        if not tagline_tree.css_first('div#no_content'):
            for tagline_node in tagline_tree.css('div.soda'):
                # TODO: should a Tagline object be created that stores the note for each tagline separately?
                taglines.append(tagline_node.text().strip())

    # Parse through text blocks
    for text_block_node in tree.css('div#titleDetails > div.txt-block'):
        heading_node = text_block_node.css_first('h4.inline')
        if not heading_node:
            # Without a heading there is no way to tell what the block holds
            continue
        text_block_id = heading_node.text().lower().strip()
        text_block_text = text_block_node.text()

        if 'country' in text_block_id:
            country_node = text_block_node.css_first('a')
            if country_node:
                country = country_node.text().strip()
        elif 'language' in text_block_id:
            language_node = text_block_node.css_first('a')
            if language_node:
                language = language_node.text().strip()
        elif 'release date' in text_block_id:
            release_date_match = re.search(r'\d+?\s*\w+?\s*[\d]{4}',
                                           text_block_text)
            if release_date_match:
                release_date = release_date_match.group(0)
        elif 'production co' in text_block_id:
            for company in text_block_node.css('a'):
                company_id = get_company_id(company)
                if company_id:
                    production_companies.append(company_id)
        # Box office info
        elif 'budget' in text_block_id:
            if is_money_string(text_block_text):
                budget = trim_money_string(text_block_text)
                budget_denomination = get_denomination(text_block_text)
        elif 'opening weekend' in text_block_id:
            if is_money_string(text_block_text):
                opening_weekend_gross = trim_money_string(text_block_text)
                opening_weekend_date_node = text_block_node.css_first('span')
                if opening_weekend_date_node:
                    opening_weekend_date = opening_weekend_date_node.text(
                    ).strip()
        elif 'gross usa' in text_block_id:
            if is_money_string(text_block_text):
                usa_gross = trim_money_string(text_block_text)
        elif 'worldwide gross' in text_block_id:
            if is_money_string(text_block_text):
                worldwide_gross = trim_money_string(text_block_text)

    # Get top cast members
    cast_node = tree.css_first('table.cast_list')
    if cast_node:
        for cast_member in cast_node.css('tr.odd, tr.even'):
            cast_member_node = cast_member.css_first('td:nth-of-type(2) > a')
            if not cast_member_node:
                continue

            character_credit = None
            episode_count = None
            episode_year_start = None
            episode_year_end = None
            character_node = cast_member.css_first('td.character')
            if character_node:
                # Check if there is episode information, save it, then remove it
                # so it does not end up in the character credit text
                episode_info_node = character_node.css_first(
                    'a.toggle-episodes')
                if episode_info_node:
                    episode_count, episode_year_start, episode_year_end = get_episode_info(
                        episode_info_node)
                    episode_info_node.decompose()
                character_credit = re.sub(r'\s+', ' ',
                                          character_node.text().strip())

            top_cast.append(
                CreditScrape(name_id=get_name_id(cast_member_node),
                             title_id=title_id,
                             job_title=ACTOR,
                             credit=character_credit,
                             episode_count=episode_count,
                             episode_year_start=episode_year_start,
                             episode_year_end=episode_year_end))

    # Get season and episode numbers if TV episode
    for heading_node in tree.css('div.bp_heading'):
        if 'Season' not in heading_node.text():
            continue
        heading_node_text = heading_node.text().lower()
        season_number_match = re.search(r'season\s*(\d+)', heading_node_text)
        if season_number_match:
            season_number = season_number_match.group(1)
        episode_number_match = re.search(r'episode\s*(\d+)',
                                         heading_node_text)
        if episode_number_match:
            episode_number = episode_number_match.group(1)

    return TitleScrape(title_id=title_id,
                       display_title=display_title,
                       title_parent_id=title_parent_id,
                       mpaa_rating=rating,
                       country=country,
                       language=language,
                       release_date=release_date,
                       end_year=end_year,
                       season_number=season_number,
                       episode_number=episode_number,
                       taglines=taglines,
                       plot=plot,
                       storyline=storyline,
                       production_companies=production_companies,
                       top_cast=top_cast,
                       budget=budget,
                       budget_denomination=budget_denomination,
                       opening_weekend_gross=opening_weekend_gross,
                       opening_weekend_date=opening_weekend_date,
                       usa_gross=usa_gross,
                       worldwide_gross=worldwide_gross)
def get_name(self, name_id, include_known_for_titles=False):
    """Scrapes detailed information from a person's personal IMDb web page.

    Reads the person's IMDb `bio` page and collects the display name plus
    the rows of the overview table (born, died, birth name, nicknames and
    height) into a new `NameScrape` object. Rows or nodes that are missing
    leave the matching field at its initial value.

    Args:
        name_id (:obj:`str`): The person's ID used by IMDb prefixed with `nm`.
        include_known_for_titles (:obj:`bool`, optional): Determines if a second request
            should be sent to get the known for titles on a person's default IMDb page.

    Returns:
        :class:`~.models.name.NameScrape`: An object with the person's information.

    Raises:
        HTTPError: If the request failed.
    """
    bio_tree = self._get_tree(f'https://www.imdb.com/name/{name_id}/bio')

    display_name = None
    known_for_titles = []
    birth_date = None
    birth_city = None
    death_date = None
    death_city = None
    death_cause = None
    birth_name = None
    nicknames = []
    height = None

    name_link = bio_tree.css_first(
        'div#main > div:nth-of-type(1) > div:nth-of-type(1) > div > h3 > a')
    if name_link:
        display_name = name_link.text().strip()

    # Walk down to the overview rows; a missing level leaves nothing to scan
    bio_content = bio_tree.css_first('div#bio_content')
    overview_table = None
    if bio_content:
        overview_table = bio_content.css_first('table#overviewTable')
    overview_rows = overview_table.css('tr') if overview_table else []

    for row in overview_rows:
        label_cell = row.css_first('td.label')
        if not label_cell:
            continue
        label = label_cell.text().lower().strip()

        if label == 'born':
            time_node = row.css_first('td > time')
            if time_node and 'datetime' in time_node.attributes:
                birth_date = time_node.attributes['datetime']
            city_link = row.css_first('td > a')
            if city_link:
                birth_city = city_link.text().strip()
        elif label == 'died':
            time_node = row.css_first('td > time')
            if time_node and 'datetime' in time_node.attributes:
                death_date = time_node.attributes['datetime']
            city_link = row.css_first('td > a')
            if city_link:
                death_city = city_link.text().strip()
            value_cell = row.css_first('td ~ td')
            if value_cell:
                # The cause of death is whatever sits between parentheses
                cause_match = re.search(r'\(.*\)', value_cell.text())
                if cause_match:
                    death_cause = cause_match.group(0).strip('()')
        elif label == 'birth name':
            value_cell = row.css_first('td ~ td')
            if value_cell:
                birth_name = value_cell.text().strip()
        elif label == 'nicknames':
            value_cell = row.css_first('td ~ td')
            if value_cell:
                # Work on the cell's HTML so the <br> separators survive
                cell_html = re.sub(r'</*td>', '', value_cell.html).strip()
                nicknames = split_by_br(cell_html)
        elif label == 'height':
            value_cell = row.css_first('td ~ td')
            if value_cell:
                # Keep the number that directly follows the opening parenthesis
                height_match = re.search(r'\(\d+\.*\d*',
                                         value_cell.text().strip())
                if height_match:
                    height = height_match.group(0).strip('(')

    if include_known_for_titles:
        main_tree = self._get_tree(f'https://www.imdb.com/name/{name_id}/')
        known_for_node = main_tree.css_first('#knownfor, #knownfor-stacked')
        if known_for_node:
            candidate_ids = (get_title_id(entry.css_first('a'))
                             for entry in known_for_node.css('.knownfor-title'))
            known_for_titles.extend(found_id for found_id in candidate_ids
                                    if found_id)

    return NameScrape(name_id=name_id,
                      display_name=display_name,
                      known_for_titles=known_for_titles,
                      birth_name=birth_name,
                      birth_date=birth_date,
                      birth_city=birth_city,
                      death_date=death_date,
                      death_city=death_city,
                      death_cause=death_cause,
                      nicknames=nicknames,
                      height=height)
def get_name_credits(self, name_id, include_episodes=False):
    """Scrapes all title credits a person is included in.

    Goes through every `div.filmo-row` of the `full filmography` on a
    person's IMDb page and yields one `NameCreditScrape` per row, carrying
    the category the row is listed under. With `include_episodes` set, the
    individual episodes of a row are yielded first (each as its own
    `NameCreditScrape`), followed by the row's own credit. Nothing is
    yielded when the page has no filmography node.

    Args:
        name_id (:obj:`str`): The person's ID used by IMDb prefixed with `nm`.
        include_episodes (:obj:`bool`, optional): Specify if individual episodes of a
            TV series should also be scraped.

    Yields:
        :class:`~.models.name.NameCreditScrape`: An object for each credit in the person's filmography.

    Raises:
        HTTPError: If a request failed.
    """
    page_tree = self._get_tree(f'https://www.imdb.com/name/{name_id}/')

    filmography = page_tree.css_first('div#filmography')
    if not filmography:
        return

    for row in filmography.css('div.filmo-row'):
        # The row id has the form '<category>-<title id>'
        category, title_id = row.id.split('-')
        category = '_'.join(category.split()).lower()
        start_year = None
        end_year = None
        role = None

        year_column = row.css_first('span.year_column')
        if year_column:
            years = year_column.text().strip()
            if len(years) > 0:
                if '-' in years:
                    start_year, end_year = years.split('-')
                else:
                    start_year = years

        info = split_by_br(row.html)
        if len(info) > 1:
            title_info, role = info
            role_without_divs = remove_tags_and_content(role, 'div')
            role = re.sub(r'<.*?>', '', role_without_divs).strip()

            if include_episodes and row.css_first('div.filmo-episodes'):
                # Send AJAX request if a "show all" link exists
                show_all_node = row.css_first(
                    f'div#more-episodes-{title_id}-{category} ~ div.filmo-episodes'
                )
                episode_source = row
                if show_all_node:
                    onclick_node = show_all_node.css_first('div > a')
                    ref_marker = get_ref_marker(onclick_node)
                    category_req = get_category(onclick_node)
                    episodes_request = (
                        f'https://www.imdb.com/name/{name_id}/episodes/_ajax?title={title_id}'
                        f'&category={category_req}&ref_marker={ref_marker}&start_index=0'
                    )
                    try:
                        episode_source = self._get_tree(episodes_request)
                    except requests.exceptions.HTTPError as e:
                        if e.response.status_code != 404:
                            raise e
                        # Some AJAX calls seem to 404, so ignore them and remove the "show all" link;
                        # the episodes already present in the row are used instead
                        show_all_node.decompose()

                for episode_node in episode_source.css('div.filmo-episodes'):
                    episode_link = episode_node.css_first('a')
                    episode_id = get_title_id(
                        episode_link) if episode_link else None

                    # Text before the first '...' holds the year, the rest is the role
                    year_info, separator, role_text = episode_node.text(
                    ).partition('...')
                    episode_role = None
                    if separator:
                        episode_role = role_text.strip()
                        if len(episode_role) == 0:
                            episode_role = None

                    episode_year = None
                    year_match = re.search(r'\([\d]{4}\)', year_info)
                    if year_match:
                        episode_year = year_match.group(0).strip('()')

                    yield NameCreditScrape(name_id=name_id,
                                           title_id=episode_id,
                                           category=category,
                                           start_year=episode_year,
                                           end_year=None,
                                           role=episode_role,
                                           title_notes=[])
        else:
            title_info, = info

        # Drop the anchor tags, then collect every parenthesised note
        title_info = re.sub(r'(<\s*a.*?>|<.*?a\s*>)', '', title_info)
        title_notes = [
            note.strip('()') for note in re.findall(r'\(.*?\)', title_info)
        ]
        if role is not None and len(role) == 0:
            role = None

        yield NameCreditScrape(name_id=name_id,
                               title_id=title_id,
                               category=category,
                               start_year=trim_year(start_year),
                               end_year=trim_year(end_year),
                               role=role,
                               title_notes=title_notes)
def get_full_cast(self, title_id, include_episodes=False):
    """Scrapes the full cast of actors for a specified title.

    Will scrape the full cast of actors for a title, each into their own
    `CreditScrape` object. An optional argument `include_episodes` will also
    scrape each episode an actor is in if the title is a TV series; those
    per-episode credits are yielded before the actor's overall credit.

    Nothing is yielded when the page has no `table.cast_list` node.

    Args:
        title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
        include_episodes (:obj:`bool`, optional): Specify if individual episodes of a
            TV series should also be scraped.

    Yields:
        :class:`~.models.title.CreditScrape`: An object for each cast member in the title.

    Raises:
        HTTPError: If a request failed.
    """
    request = f'https://www.imdb.com/title/{title_id}/fullcredits'
    tree = self._get_tree(request)

    cast_table_node = tree.css_first('table.cast_list')
    if not cast_table_node:
        # No cast table on the page: there is nothing to scrape, and calling
        # .css() on the missing node would raise AttributeError
        return

    for cast_member in cast_table_node.css('tr'):
        actor_node = cast_member.css_first('td.primary_photo + td > a')
        if not actor_node:
            # Rows without an actor link carry no credit
            continue

        name_id = get_name_id(actor_node)
        credit = None
        episode_count = None
        episode_year_start = None
        episode_year_end = None

        # Check if this is a TV series
        toggle_episodes_node = cast_member.css_first('a.toggle-episodes')
        if toggle_episodes_node:
            episode_count, episode_year_start, episode_year_end = get_episode_info(
                toggle_episodes_node)

            # Include all individual episodes an actor is in
            if include_episodes:
                ref_marker = get_ref_marker(toggle_episodes_node)
                episodes_request = (
                    f'https://www.imdb.com/name/{name_id}/episodes/_ajax?title={title_id}'
                    f'&category=actor&ref_marker={ref_marker}&start_index=0')
                episodes_tree = self._get_tree(episodes_request)
                for episode_node in episodes_tree.css('div.filmo-episodes'):
                    episode_id = get_title_id(episode_node.css_first('a'))
                    episode_year = None
                    episode_credit = None

                    # Text before the first '...' holds the year, the rest is the credit
                    episode_info = episode_node.text().strip().split('...')
                    episode_year_info = episode_info[0]
                    if len(episode_info) > 1:
                        episode_credit = '...'.join(episode_info[1:]).strip()

                    episode_year_match = re.search(r'\([\d]{4}\)',
                                                   episode_year_info)
                    if episode_year_match:
                        episode_year = episode_year_match.group(0).strip('()')

                    yield CreditScrape(name_id=name_id,
                                       title_id=episode_id,
                                       job_title=ACTOR,
                                       credit=episode_credit,
                                       episode_count=None,
                                       episode_year_start=episode_year,
                                       episode_year_end=None)

            # Remove the TV series info from the row so it does not end up
            # in the character credit text read below
            toggle_episodes_node.decompose()

        # Get the actor's credits
        character_node = cast_member.css_first('td.character')
        if character_node:
            credit = re.sub(r'\s+', ' ', character_node.text().strip())

        yield CreditScrape(name_id=name_id,
                           title_id=title_id,
                           job_title=ACTOR,
                           credit=credit,
                           episode_count=episode_count,
                           episode_year_start=episode_year_start,
                           episode_year_end=episode_year_end)