def _parse_series_intro(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the intro and catchphrase for a series.

    Both the intro and catchphrase are optional, so a series might not
    have them set.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    # NOTE: use explicit None checks on bs4 tags; Tag truthiness depends
    # on whether the tag has children, not on whether it was found.
    catch_tag = series_page_soup.find(id=self._CATCHPHRASE_TAG_ID)
    if catch_tag is not None:
        series_blog.catchphrase = html.parse_valid_child_text(catch_tag)

    intro_tag = series_page_soup.find(id=self._INTRO_TAG_ID)
    if intro_tag is None:
        return

    # Remove the expand button text from the end of the intro
    expand_button = intro_tag.find(
        'span', class_=self._INTRO_EXPAND_BUTTON_CLASS)
    if expand_button is not None:
        expand_button.decompose()
    series_blog.introduction = html.parse_valid_child_text(intro_tag)
def _parse_body_div(self, tag: Tag) -> Optional[str]:
    """Parse the body text from a division of an NHK article.

    Args:
        tag: Tag containing a division of an NHK article.

    Returns:
        The parsed body text from tag, or None if no valid body text
        could be parsed from it.

    Raises:
        CannotParsePageError: There was an error parsing the body text
            from tag.
    """
    whole_text = html.parse_valid_child_text(tag, False)
    if whole_text is not None:
        return whole_text

    # The division couldn't be parsed as a whole, so fall back to
    # parsing each child tag individually and joining the results.
    sections = []
    for child in tag.children:
        # Skip text around child tags such as '\n'
        if child.name is None:
            continue

        # A non-None empty string is also skipped, matching the
        # original separate None and length checks.
        child_text = html.parse_valid_child_text(child, False)
        if child_text:
            sections.append(child_text)

    return '\n'.join(sections) if sections else None
def _parse_episode_text(self, episode_page_soup: BeautifulSoup) -> str:
    """Parse the full text for an episode.

    Args:
        episode_page_soup: A BeautifulSoup initialized with the content
            from an episode page.

    Returns:
        The full text for the episode.
    """
    title = html.parse_text_from_descendant_by_class(
        episode_page_soup, self._EPISODE_TITLE_CLASS, 'p')
    # The empty entry adds an extra new line after the title once joined.
    lines = [title.strip(), '']

    text_div = html.select_one_descendant_by_class(
        episode_page_soup, self._EPISODE_TEXT_DIV_CLASS, 'div')
    for para in html.select_descendants_by_tag(text_div, 'p'):
        para_text = html.parse_valid_child_text(para, False)
        # Unparsable paragraphs become blank lines in the output.
        lines.append('' if para_text is None else para_text)

    return '\n'.join(lines)
def _parse_series_tags(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the tags for a series.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    # The genre always exists and is stored as the first tag.
    genre = html.parse_text_from_descendant_by_id(
        series_page_soup, self._SERIES_GENRE_TAG_ID)
    series_blog.tags = [genre.strip()]

    # If a series has no tags set for it, the tag div won't exist on the
    # series page.
    tag_div = series_page_soup.find(id=self._SERIES_TAG_DIV_ID)
    if tag_div is None:
        return

    for tag_list in tag_div.find_all('ul'):
        series_blog.tags.extend(
            html.parse_valid_child_text(item).strip()
            for item in tag_list.find_all('li'))
def _parse_series_episode_metadatas(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> List[JpnArticle]:
    """Parse the episode metadatas for a series from its homepage.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: Blog info for this series.

    Returns:
        A list of the article metadatas for all episodes listed on the
        series homepage.

    Raises:
        HtmlParsingError: A table of contents list item was neither a
            section nor an episode.
    """
    toc_items = self._select_table_of_contents_items(series_page_soup)

    article_metas = []
    ep_num = 1  # Overall episode order across the whole series
    section_num = 0
    section_ep_num = 1  # Episode order within the current section
    section_name = None
    for toc_item in toc_items:
        if self._is_section_li(toc_item):
            # Entering a new section: bump the section counter and
            # restart the per-section episode counter.
            section_num += 1
            section_ep_num = 1
            section_name = html.parse_valid_child_text(toc_item).strip()
            continue

        if not self._is_episode_li(toc_item):
            utils.log_and_raise(
                _log, HtmlParsingError,
                'Unrecognized list item "{}" in table of contents: '
                '"{}"'.format(toc_item, series_page_soup))

        article_metas.append(self._parse_table_of_contents_episode(
            toc_item, series_blog, ep_num, section_name, section_num,
            section_ep_num))
        ep_num += 1
        section_ep_num += 1

    return article_metas
def _parse_search_results_page(
        self, page_soup: BeautifulSoup) -> List[JpnArticleBlog]:
    """Parse the series blog info from a search results page.

    Args:
        page_soup: A BeautifulSoup initialized with the content from a
            search page.

    Returns:
        A list of the series blog info for all of the series listed in
        the search results page.
    """
    tiles = html.select_descendants_by_class(
        page_soup, self._SEARCH_RESULT_TILE_CLASS, 'div')
    _log.debug('Found %s series on search results page', len(tiles))

    blogs = []
    for tile in tiles:
        blog = JpnArticleBlog(source_name=self.SOURCE_NAME)

        title_link = html.select_one_descendant_by_class(
            tile, self._SEARCH_RESULT_TITLE_CLASS, 'a')
        blog.title = html.parse_valid_child_text(title_link).strip()
        blog.source_url = utils.strip_url_query_and_frag(
            title_link['href'])

        blog.author = html.parse_text_from_descendant_by_class(
            tile, self._SEARCH_RESULT_AUTHOR_CLASS, 'a').strip()

        updated_text = html.parse_text_from_descendant_by_class(
            tile, self._SEARCH_RESULT_LAST_UPDATED_CLASS, 'span')
        blog.last_updated_datetime = self._parse_search_result_datetime(
            updated_text)

        blogs.append(blog)

    return blogs