Exemple #1
0
    def _parse_series_intro(self, series_page_soup: BeautifulSoup,
                            series_blog: JpnArticleBlog) -> None:
        """Store a series' catchphrase and introduction on its blog object.

        The catchphrase and the introduction are each looked up by element id
        on the series homepage. Either one may be absent; when an element is
        not found, the matching attribute of series_blog is left untouched.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object that receives the parsed catchphrase
                and introduction.
        """
        catchphrase_elem = series_page_soup.find(id=self._CATCHPHRASE_TAG_ID)
        if catchphrase_elem is not None:
            catchphrase_text = html.parse_valid_child_text(catchphrase_elem)
            series_blog.catchphrase = catchphrase_text

        intro_elem = series_page_soup.find(id=self._INTRO_TAG_ID)
        if intro_elem is None:
            # No introduction on this series page, so nothing more to parse.
            return

        # The intro can contain an expand button span; drop it from the tree
        # first so that its text does not end up in the parsed introduction.
        expand_button = intro_elem.find(
            'span', class_=self._INTRO_EXPAND_BUTTON_CLASS)
        if expand_button is not None:
            expand_button.decompose()

        series_blog.introduction = html.parse_valid_child_text(intro_elem)
Exemple #2
0
    def _parse_body_div(self, tag: Tag) -> Optional[str]:
        """Extract the body text held in one division of an NHK article.

        The division is first parsed as a whole. If that yields no text
        (None), each child tag is parsed individually instead and the
        non-empty results are joined with newlines.

        Args:
            tag: Tag containing a division of an NHK article.

        Returns:
            The parsed body text from tag, or None if neither the division
            itself nor any of its child tags produced non-empty text.

        Note:
            Nothing in this method raises directly; any parsing error would
            have to originate in html.parse_valid_child_text.
        """
        whole_div_text = html.parse_valid_child_text(tag, False)
        if whole_div_text is not None:
            return whole_div_text

        # Children whose name is None are bare strings around the child tags
        # (such as '\n'), so only real tags are parsed. The generator is lazy,
        # so children are still parsed one at a time in document order.
        parsed_children = (
            html.parse_valid_child_text(child, False)
            for child in tag.children
            if child.name is not None
        )
        # The truthiness test drops both None results and empty strings.
        non_empty_texts = [text for text in parsed_children if text]

        if not non_empty_texts:
            return None
        return '\n'.join(non_empty_texts)
Exemple #3
0
    def _parse_episode_text(self, episode_page_soup: BeautifulSoup) -> str:
        """Build the full text of an episode from its page.

        The result is the stripped episode title, a blank line, and then one
        line per paragraph found in the episode text div. A paragraph that
        parses to None is represented by an empty line.

        Args:
            episode_page_soup: A BeautifulSoup initialized with the content
                from an episode page.

        Returns:
            The full text for the episode, with lines joined by newlines.
        """
        title_text = html.parse_text_from_descendant_by_class(
            episode_page_soup, self._EPISODE_TITLE_CLASS, 'p')
        # The empty string after the title becomes a blank separator line
        # once everything is joined.
        lines = [title_text.strip(), '']

        text_div = html.select_one_descendant_by_class(
            episode_page_soup, self._EPISODE_TEXT_DIV_CLASS, 'div')
        for paragraph in html.select_descendants_by_tag(text_div, 'p'):
            paragraph_text = html.parse_valid_child_text(paragraph, False)
            # Keep a line for paragraphs without text so spacing is preserved.
            lines.append('' if paragraph_text is None else paragraph_text)

        return '\n'.join(lines)
Exemple #4
0
    def _parse_series_tags(self, series_page_soup: BeautifulSoup,
                           series_blog: JpnArticleBlog) -> None:
        """Collect the genre and tags of a series into series_blog.tags.

        series_blog.tags is reset to a new list. The stripped genre text is
        always its first entry, followed by the stripped text of every list
        item found in the tag div, in document order.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object whose tags attribute is populated.
        """
        series_blog.tags = []
        genre_text = html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_GENRE_TAG_ID)
        series_blog.tags.append(genre_text.strip())

        # A series with no tags set has no tag div on its page at all, in
        # which case the genre remains the only entry.
        tag_container = series_page_soup.find(id=self._SERIES_TAG_DIV_ID)
        if tag_container is not None:
            series_blog.tags.extend(
                html.parse_valid_child_text(list_item).strip()
                for unordered_list in tag_container.find_all('ul')
                for list_item in unordered_list.find_all('li')
            )
Exemple #5
0
    def _parse_series_episode_metadatas(
            self, series_page_soup: BeautifulSoup,
            series_blog: JpnArticleBlog) -> List[JpnArticle]:
        """Walk a series' table of contents and parse each episode's metadata.

        Table of contents items are processed in order. A section item starts
        a new section: the section counter is incremented, the per-section
        episode counter restarts at 1, and the section name is updated. An
        episode item is parsed together with its overall position, the current
        section name and number, and its position within that section.
        Episodes that appear before any section item are parsed with section
        name None and section number 0.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: Blog info for this series.

        Returns:
            A list of the article metadatas for all episodes listed on the
            series homepage, in table of contents order.
        """
        toc_items = self._select_table_of_contents_items(series_page_soup)

        episode_metas = []
        overall_ep_num = 1
        current_section_num = 0
        ep_num_in_section = 1
        current_section_name = None
        for toc_item in toc_items:
            if self._is_section_li(toc_item):
                # New section: bump its number and restart episode numbering.
                current_section_num += 1
                ep_num_in_section = 1
                section_text = html.parse_valid_child_text(toc_item)
                current_section_name = section_text.strip()
                continue

            if not self._is_episode_li(toc_item):
                utils.log_and_raise(
                    _log, HtmlParsingError,
                    'Unrecognized list item "{}" in table of contents: '
                    '"{}"'.format(toc_item, series_page_soup))
                # Only reached if the call above returns; then move on to the
                # next item without treating this one as an episode.
                continue

            episode_meta = self._parse_table_of_contents_episode(
                toc_item, series_blog, overall_ep_num, current_section_name,
                current_section_num, ep_num_in_section)
            episode_metas.append(episode_meta)
            overall_ep_num += 1
            ep_num_in_section += 1

        return episode_metas
Exemple #6
0
    def _parse_search_results_page(
            self, page_soup: BeautifulSoup) -> List[JpnArticleBlog]:
        """Turn each series tile on a search results page into blog info.

        For every result tile, a JpnArticleBlog is created with this source's
        name and filled in with the stripped title, the source URL (the title
        link's href with query and fragment removed), the stripped author
        name, and the parsed last updated datetime.

        Args:
            page_soup: A BeautifulSoup initialized with the content from a
                search page.

        Returns:
            A list of the series blog info for all of the series listed in the
            search results page, in page order.
        """
        result_tiles = html.select_descendants_by_class(
            page_soup, self._SEARCH_RESULT_TILE_CLASS, 'div')
        _log.debug('Found %s series on search results page', len(result_tiles))

        parsed_blogs = []
        for tile in result_tiles:
            blog = JpnArticleBlog(source_name=self.SOURCE_NAME)

            # The title anchor supplies both the title text and the URL.
            title_anchor = html.select_one_descendant_by_class(
                tile, self._SEARCH_RESULT_TITLE_CLASS, 'a')
            title_text = html.parse_valid_child_text(title_anchor)
            blog.title = title_text.strip()
            blog.source_url = utils.strip_url_query_and_frag(
                title_anchor['href'])

            author_text = html.parse_text_from_descendant_by_class(
                tile, self._SEARCH_RESULT_AUTHOR_CLASS, 'a')
            blog.author = author_text.strip()

            last_updated_text = html.parse_text_from_descendant_by_class(
                tile, self._SEARCH_RESULT_LAST_UPDATED_CLASS, 'span')
            blog.last_updated_datetime = self._parse_search_result_datetime(
                last_updated_text)

            parsed_blogs.append(blog)

        return parsed_blogs