Esempio n. 1
0
    def get_company(self, company_id):
        """Scrapes all titles a company is credited for on IMDb.

        Walks through IMDb's `company search` results, advancing the `start` query
        parameter by 50 for every request, and yields one object per listed title.
        Only the title ID, the year(s) shown next to the title and any `notes` text
        found in the year column are gathered.

        Paging stops as soon as a request answers with HTTP 404 or a page does not
        contain a `div.lister-list` node.

        Args:
            company_id (:obj:`str`): The company's ID used by IMDb prefixed with `co`.

        Yields:
            :class:`~.models.company.CompanyScrape`: An object for each title the company is credited for.

        Raises:
            HTTPError: If a request failed with a status other than 404.
            InvalidCompanyId: If the page header is empty once the sort label is removed,
                which is treated as an invalid company ID.
        """

        index = 1
        while True:
            request = f'https://www.imdb.com/search/title/?companies={company_id}&view=simple&start={index}'
            try:
                tree = self._get_tree(request)
            except requests.exceptions.HTTPError as e:
                # A 404 ends the paging. Stop right here: there is no tree for this
                # request, so falling through would either fail on an unbound name
                # or parse the previous page a second time.
                if e.response.status_code == 404:
                    return
                raise

            # Check if this was a valid company ID
            company_title_node = tree.css_first('div.article > h1.header')
            if company_title_node:
                company_title = company_title_node.text().replace(
                    '(Sorted by Popularity Ascending)', '').strip()
                if not company_title:
                    raise InvalidCompanyId(f'Invalid company ID: {company_id}')

            title_list_node = tree.css_first('div.lister-list')
            if not title_list_node:
                return

            for header_node in title_list_node.css('span.lister-item-header'):
                title_id = None
                start_year = None
                end_year = None
                notes = None
                year_info_node = None

                # Check if this is a TV episode
                episode_node = header_node.css_first('small')
                if episode_node and 'Episode' in episode_node.text():
                    episode_link_node = header_node.css_first('small ~ a')
                    title_id = get_title_id(episode_link_node)
                    year_info_node = header_node.css_first(
                        'small ~ a ~ span.lister-item-year')
                else:
                    title_node = header_node.css_first(
                        'span.lister-item-index ~ span')
                    if title_node:
                        title_link_node = title_node.css_first('a')
                        title_id = get_title_id(title_link_node)
                        year_info_node = title_node.css_first(
                            'span.lister-item-year')

                if year_info_node:
                    year_info_text = year_info_node.text().strip('()')
                    years_match = re.search(r'(\d|–|-)+', year_info_text)
                    notes_match = re.search(r'([A-Za-z]+\s*)+', year_info_text)
                    if years_match:
                        # Split the matched range on runs of hyphens / en dashes
                        year_info = re.split(r'[–\-]+', years_match.group(0))
                        if len(year_info) > 1:
                            start_year, end_year = year_info
                            # Handle shows that are still on-air (ex: '2005- ')
                            if not end_year:
                                end_year = None
                        else:
                            start_year, = year_info
                    if notes_match:
                        notes = notes_match.group(0)

                yield CompanyScrape(company_id=company_id,
                                    title_id=title_id,
                                    start_year=start_year,
                                    end_year=end_year,
                                    notes=notes)
            index += 50
Esempio n. 2
0
    def get_title(self, title_id, include_taglines=False):
        """Builds a `TitleScrape` from the IMDb web page of a single title.

        Requests the title's main page and reads, where present: the display title,
        the end year of a series, the rating text, the parent title of an episode,
        plot and storyline, the details text blocks (country, language, release date,
        production companies and box office figures), the cast table shown on the page
        and the season / episode numbers. When `include_taglines` is set, one more
        request is made to the title's `taglines` page.

        Args:
            title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
            include_taglines (:obj:`bool`, optional): Specify if an extra request should be
                made to get all the taglines for the title.

        Returns:
            :class:`~.models.title.TitleScrape`: An object containing the page's information.
            Values that could not be found stay `None` (or an empty list).

        Raises:
            HTTPError: If the request failed.
        """

        page = self._get_tree(f'https://www.imdb.com/title/{title_id}/')

        # Every field starts out as "not found" and is filled in as the page is parsed
        display_title = title_parent_id = rating = None
        country = language = release_date = end_year = None
        season_number = episode_number = None
        plot = storyline = None
        budget = budget_denomination = None
        opening_weekend_gross = opening_weekend_date = None
        usa_gross = worldwide_gross = None
        taglines = []
        production_companies = []
        top_cast = []

        # Title header: display title, series end year and rating
        header_node = page.css_first('div.title_wrapper')
        if header_node:
            heading_node = header_node.css_first('h1')
            if heading_node:
                # Drop the year from the heading before reading its text
                year_span = heading_node.css_first('span#titleYear')
                if year_span:
                    year_span.decompose()
                display_title = heading_node.text().strip()

            subtext_node = header_node.css_first('div.subtext')
            if subtext_node:
                # A release info link showing a year range gives the end year of a series
                for link_node in subtext_node.css('a'):
                    link_attributes = link_node.attributes
                    if 'href' not in link_attributes or 'releaseinfo' not in link_attributes['href']:
                        continue
                    year_range_match = re.search(r'[\d]{4}[-–][\d]{4}',
                                                 link_node.text())
                    if not year_range_match:
                        continue
                    year_parts = re.sub(r'[-–]', '\t',
                                        year_range_match.group(0)).split('\t')
                    if len(year_parts) > 1:
                        end_year = year_parts[1]
                        break

                # With the listed tags stripped, the remaining text is used as the rating
                subtext_node.strip_tags(['span', 'a', 'time'])
                rating = re.sub(r'(\s|,)*', '', subtext_node.text()).strip()

        # Parent title (only present for TV episodes)
        parent_link_node = page.css_first('div.titleParent > a')
        if parent_link_node:
            title_parent_id = get_title_id(parent_link_node)

        # Plot summary
        summary_node = page.css_first('div.summary_text')
        if summary_node:
            plot = summary_node.text().strip()

        # Storyline
        storyline_section = page.css_first('div#titleStoryLine')
        if storyline_section:
            storyline_text_node = storyline_section.css_first('div > p > span')
            if storyline_text_node:
                storyline = storyline_text_node.text().strip()

        # Taglines live on their own page, so they cost an additional request
        if include_taglines:
            tagline_page = self._get_tree(
                f'https://www.imdb.com/title/{title_id}/taglines')
            if not tagline_page.css_first('div#no_content'):
                # TODO: should a Tagline object be created that stores the note for each tagline separately?
                taglines = [
                    tagline_node.text().strip()
                    for tagline_node in tagline_page.css('div.soda')
                ]

        # Details section: each text block is identified by its inline heading
        for block_node in page.css('div#titleDetails > div.txt-block'):
            label_node = block_node.css_first('h4.inline')
            if not label_node:
                continue
            label = label_node.text().lower().strip()
            block_text = block_node.text()

            if 'country' in label:
                country_link = block_node.css_first('a')
                if country_link:
                    country = country_link.text().strip()
            elif 'language' in label:
                language_link = block_node.css_first('a')
                if language_link:
                    language = language_link.text().strip()
            elif 'release date' in label:
                date_match = re.search(r'\d+?\s*\w+?\s*[\d]{4}', block_text)
                if date_match:
                    release_date = date_match.group(0)
            elif 'production co' in label:
                for company_link in block_node.css('a'):
                    company_id = get_company_id(company_link)
                    if company_id:
                        production_companies.append(company_id)
            # Box office info
            elif 'budget' in label:
                if is_money_string(block_text):
                    budget = trim_money_string(block_text)
                    budget_denomination = get_denomination(block_text)
            elif 'opening weekend' in label:
                if is_money_string(block_text):
                    opening_weekend_gross = trim_money_string(block_text)
                weekend_date_node = block_node.css_first('span')
                if weekend_date_node:
                    opening_weekend_date = weekend_date_node.text().strip()
            elif 'gross usa' in label:
                if is_money_string(block_text):
                    usa_gross = trim_money_string(block_text)
            elif 'worldwide gross' in label:
                if is_money_string(block_text):
                    worldwide_gross = trim_money_string(block_text)

        # Cast table shown on the title page
        cast_table = page.css_first('table.cast_list')
        cast_rows = cast_table.css('tr.odd, tr.even') if cast_table else []
        for cast_row in cast_rows:
            actor_link = cast_row.css_first('td:nth-of-type(2) > a')
            if not actor_link:
                continue

            character_credit = None
            episode_count = episode_year_start = episode_year_end = None
            character_node = cast_row.css_first('td.character')
            if character_node:
                # Save the episode information, then remove it so that it does
                # not end up in the character text
                toggle_node = character_node.css_first('a.toggle-episodes')
                if toggle_node:
                    episode_count, episode_year_start, episode_year_end = get_episode_info(
                        toggle_node)
                    toggle_node.decompose()
                character_credit = re.sub(r'\s+', ' ',
                                          character_node.text().strip())

            top_cast.append(
                CreditScrape(name_id=get_name_id(actor_link),
                             title_id=title_id,
                             job_title=ACTOR,
                             credit=character_credit,
                             episode_count=episode_count,
                             episode_year_start=episode_year_start,
                             episode_year_end=episode_year_end))

        # Season and episode numbers (TV episodes)
        for heading_node in page.css('div.bp_heading'):
            if 'Season' not in heading_node.text():
                continue
            heading_text = heading_node.text().lower()

            season_match = re.search(r'season\s*\d+', heading_text)
            if season_match:
                season_digits = re.search(r'\d+', season_match.group(0))
                if season_digits:
                    season_number = season_digits.group(0)

            episode_match = re.search(r'episode\s*\d+', heading_text)
            if episode_match:
                episode_digits = re.search(r'\d+', episode_match.group(0))
                if episode_digits:
                    episode_number = episode_digits.group(0)

        return TitleScrape(title_id=title_id,
                           display_title=display_title,
                           title_parent_id=title_parent_id,
                           mpaa_rating=rating,
                           country=country,
                           language=language,
                           release_date=release_date,
                           end_year=end_year,
                           season_number=season_number,
                           episode_number=episode_number,
                           taglines=taglines,
                           plot=plot,
                           storyline=storyline,
                           production_companies=production_companies,
                           top_cast=top_cast,
                           budget=budget,
                           budget_denomination=budget_denomination,
                           opening_weekend_gross=opening_weekend_gross,
                           opening_weekend_date=opening_weekend_date,
                           usa_gross=usa_gross,
                           worldwide_gross=worldwide_gross)
Esempio n. 3
0
    def get_name(self, name_id, include_known_for_titles=False):
        """Builds a `NameScrape` from a person's IMDb `bio` page.

        Reads the display name from the page header and walks the rows of the
        overview table, picking up birth and death details, birth name, nicknames
        and height according to each row's label.

        Args:
            name_id (:obj:`str`): The person's ID used by IMDb prefixed with `nm`.
            include_known_for_titles (:obj:`bool`, optional): Determines if a second request should
                be sent to get the known for titles on a person's default IMDb page.

        Returns:
            :class:`~.models.name.NameScrape`: An object with the person's information.
            Values that could not be found stay `None` (or an empty list).

        Raises:
            HTTPError: If the request failed.
        """
        bio_page = self._get_tree(f'https://www.imdb.com/name/{name_id}/bio')

        display_name = None
        birth_name = None
        birth_date = birth_city = None
        death_date = death_city = death_cause = None
        height = None
        nicknames = []
        known_for_titles = []

        name_link_node = bio_page.css_first(
            'div#main > div:nth-of-type(1) > div:nth-of-type(1) > div > h3 > a'
        )
        if name_link_node:
            display_name = name_link_node.text().strip()

        # The overview table sits inside the bio content; either may be missing
        bio_node = bio_page.css_first('div#bio_content')
        overview_node = bio_node.css_first(
            'table#overviewTable') if bio_node else None
        overview_rows = overview_node.css('tr') if overview_node else []

        for row_node in overview_rows:
            label_node = row_node.css_first('td.label')
            if not label_node:
                continue
            label = label_node.text().lower().strip()

            if label == 'born':
                born_time_node = row_node.css_first('td > time')
                if born_time_node and 'datetime' in born_time_node.attributes:
                    birth_date = born_time_node.attributes['datetime']
                born_place_node = row_node.css_first('td > a')
                if born_place_node:
                    birth_city = born_place_node.text().strip()
            elif label == 'died':
                died_time_node = row_node.css_first('td > time')
                if died_time_node and 'datetime' in died_time_node.attributes:
                    death_date = died_time_node.attributes['datetime']
                died_place_node = row_node.css_first('td > a')
                if died_place_node:
                    death_city = died_place_node.text().strip()
                # The cause of death is whatever is written in parentheses
                died_value_node = row_node.css_first('td ~ td')
                if died_value_node:
                    cause_match = re.search(r'\(.*\)', died_value_node.text())
                    if cause_match:
                        death_cause = cause_match.group(0).strip('()')
            elif label in ('birth name', 'nicknames', 'height'):
                # These rows keep their value in the cell following the label
                value_node = row_node.css_first('td ~ td')
                if not value_node:
                    continue
                if label == 'birth name':
                    birth_name = value_node.text().strip()
                elif label == 'nicknames':
                    # Work on the cell's markup without its <td> tags
                    cell_markup = re.sub(r'</*td>', '', value_node.html).strip()
                    nicknames = split_by_br(cell_markup)
                else:
                    # Keep the number that follows the opening parenthesis
                    height_match = re.search(r'\(\d+\.*\d*',
                                             value_node.text().strip())
                    if height_match:
                        height = height_match.group(0).strip('(')

        if include_known_for_titles:
            profile_page = self._get_tree(
                f'https://www.imdb.com/name/{name_id}/')
            known_for_node = profile_page.css_first(
                '#knownfor, #knownfor-stacked')
            if known_for_node:
                for title_node in known_for_node.css('.knownfor-title'):
                    known_title_id = get_title_id(title_node.css_first('a'))
                    if known_title_id:
                        known_for_titles.append(known_title_id)

        return NameScrape(name_id=name_id,
                          display_name=display_name,
                          known_for_titles=known_for_titles,
                          birth_name=birth_name,
                          birth_date=birth_date,
                          birth_city=birth_city,
                          death_date=death_date,
                          death_city=death_city,
                          death_cause=death_cause,
                          nicknames=nicknames,
                          height=height)
Esempio n. 4
0
    def get_name_credits(self, name_id, include_episodes=False):
        """Scrapes all title credits a person is included in.

        Goes through every row of the `filmography` section on a person's IMDb page
        and yields a `NameCreditScrape` object with the title, the category the credit
        is listed under, the year(s), the role and any notes written in parentheses
        next to the title. With `include_episodes` set, the episodes listed under a
        row are yielded as credits of their own before the row's credit; if the row
        has a "show all" link, the complete episode list is requested through AJAX.

        Args:
            name_id (:obj:`str`): The person's ID used by IMDb prefixed with `nm`.
            include_episodes (:obj:`bool`, optional): Specify if individual episodes of a TV series
                should also be scraped.

        Yields:
            :class:`~.models.name.NameCreditScrape`: An object for each credit in the person's filmography.

        Raises:
            HTTPError: If a request failed.
        """

        page = self._get_tree(f'https://www.imdb.com/name/{name_id}/')

        filmography = page.css_first('div#filmography')
        if not filmography:
            return

        for credit_row in filmography.css('div.filmo-row'):
            # The row ID has the form `<category>-<title ID>`
            raw_category, title_id = credit_row.id.split('-')
            category = '_'.join(raw_category.split()).lower()
            first_year = None
            last_year = None
            role = None

            year_column = credit_row.css_first('span.year_column')
            if year_column:
                year_text = year_column.text().strip()
                if '-' in year_text:
                    first_year, last_year = year_text.split('-')
                elif year_text:
                    first_year = year_text

            # The markup before the line break holds the title, the part after it the role
            row_parts = split_by_br(credit_row.html)
            if len(row_parts) > 1:
                title_markup, role_markup = row_parts
                role = re.sub(r'<.*?>', '',
                              remove_tags_and_content(role_markup,
                                                      'div')).strip()

                if include_episodes and credit_row.css_first(
                        'div.filmo-episodes'):
                    # Send AJAX request if a "show all" link exists
                    show_all_node = credit_row.css_first(
                        f'div#more-episodes-{title_id}-{category} ~ div.filmo-episodes'
                    )
                    episode_source = credit_row
                    if show_all_node:
                        show_all_link = show_all_node.css_first('div > a')
                        ref_marker = get_ref_marker(show_all_link)
                        ajax_category = get_category(show_all_link)
                        episodes_url = (
                            f'https://www.imdb.com/name/{name_id}/episodes/_ajax?title={title_id}'
                            f'&category={ajax_category}&ref_marker={ref_marker}&start_index=0'
                        )
                        try:
                            episode_source = self._get_tree(episodes_url)
                        except requests.exceptions.HTTPError as e:
                            if e.response.status_code != 404:
                                raise e
                            # Some AJAX calls seem to 404, so ignore them and remove the "show all" link;
                            # the episodes already present in the row are used instead
                            show_all_node.decompose()

                    for episode_node in episode_source.css(
                            'div.filmo-episodes'):
                        episode_link = episode_node.css_first('a')
                        episode_id = get_title_id(
                            episode_link) if episode_link else None

                        # The text before the first '...' carries the year, the rest is the role
                        year_part, *role_parts = episode_node.text().split(
                            '...')
                        episode_role = None
                        if role_parts:
                            episode_role = '...'.join(
                                role_parts).strip() or None

                        episode_year = None
                        year_match = re.search(r'\([\d]{4}\)', year_part)
                        if year_match:
                            episode_year = year_match.group(0).strip('()')

                        yield NameCreditScrape(name_id=name_id,
                                               title_id=episode_id,
                                               category=category,
                                               start_year=episode_year,
                                               end_year=None,
                                               role=episode_role,
                                               title_notes=[])
            else:
                title_markup, = row_parts

            # Remove the link tags, then collect every parenthesised note
            title_markup = re.sub(r'(<\s*a.*?>|<.*?a\s*>)', '', title_markup)
            title_notes = [
                note.strip('()')
                for note in re.findall(r'\(.*?\)', title_markup)
            ]
            if not role:
                role = None

            yield NameCreditScrape(name_id=name_id,
                                   title_id=title_id,
                                   category=category,
                                   start_year=trim_year(first_year),
                                   end_year=trim_year(last_year),
                                   role=role,
                                   title_notes=title_notes)
Esempio n. 5
0
    def get_full_cast(self, title_id, include_episodes=False):
        """Scrapes the full cast of actors for a specified title.

        Requests the title's `fullcredits` page and yields a `CreditScrape` object for
        every row of the cast table that links to an actor. When a row has an episode
        toggle link (TV series), the episode count and year range are read from it, and
        with `include_episodes` set an extra AJAX request per actor yields one
        `CreditScrape` per episode before the actor's credit for the title itself.
        Nothing is yielded when the page has no cast table.

        Args:
            title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
            include_episodes (:obj:`bool`, optional): Specify if individual episodes of a
                TV series should also be scraped.

        Yields:
            :class:`~.models.title.CreditScrape`: An object for each cast member in the title.

        Raises:
            HTTPError: If a request failed.
        """

        request = f'https://www.imdb.com/title/{title_id}/fullcredits'
        tree = self._get_tree(request)

        # A page without a cast table simply has no credits to yield
        cast_table_node = tree.css_first('table.cast_list')
        if not cast_table_node:
            return

        for cast_member in cast_table_node.css('tr'):
            # Rows without an actor link are skipped
            actor_node = cast_member.css_first('td.primary_photo + td > a')
            if not actor_node:
                continue

            name_id = get_name_id(actor_node)
            credit = None
            episode_count = None
            episode_year_start = None
            episode_year_end = None

            # Check if this is a TV series
            toggle_episodes_node = cast_member.css_first('a.toggle-episodes')
            if toggle_episodes_node:
                episode_count, episode_year_start, episode_year_end = get_episode_info(
                    toggle_episodes_node)

                # Include all individual episodes an actor is in
                if include_episodes:
                    ref_marker = get_ref_marker(toggle_episodes_node)
                    request = f'https://www.imdb.com/name/{name_id}/episodes/_ajax?title={title_id}' + \
                              f'&category=actor&ref_marker={ref_marker}&start_index=0'
                    episodes_tree = self._get_tree(request)

                    for episode_node in episodes_tree.css(
                            'div.filmo-episodes'):
                        # Only look up the ID when the entry actually has a link
                        episode_link_node = episode_node.css_first('a')
                        episode_id = None
                        if episode_link_node:
                            episode_id = get_title_id(episode_link_node)

                        episode_year = None
                        episode_credit = None

                        # The text before the first '...' carries the year, the rest is the credit
                        episode_info = episode_node.text().strip().split(
                            '...')
                        episode_year_info = episode_info[0]
                        if len(episode_info) > 1:
                            episode_credit = '...'.join(
                                episode_info[1:]).strip()

                        episode_year_match = re.search(
                            r'\([\d]{4}\)', episode_year_info)
                        if episode_year_match:
                            episode_year = episode_year_match.group(
                                0).strip('()')

                        yield CreditScrape(name_id=name_id,
                                           title_id=episode_id,
                                           job_title=ACTOR,
                                           credit=episode_credit,
                                           episode_count=None,
                                           episode_year_start=episode_year,
                                           episode_year_end=None)

                # Remove the TV series info so it is not part of the character text
                toggle_episodes_node.decompose()

            # Get the actor's credits
            character_node = cast_member.css_first('td.character')
            if character_node:
                credit = re.sub(r'\s+', ' ', character_node.text().strip())

            yield CreditScrape(name_id=name_id,
                               title_id=title_id,
                               job_title=ACTOR,
                               credit=credit,
                               episode_count=episode_count,
                               episode_year_start=episode_year_start,
                               episode_year_end=episode_year_end)