def _parse_series_intro(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the intro and catchphrase for a series.

    Both the intro and catchphrase are optional, so a series might not have
    them set.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    catchphrase_tag = series_page_soup.find(id=self._CATCHPHRASE_TAG_ID)
    if catchphrase_tag is not None:
        series_blog.catchphrase = html.parse_valid_child_text(
            catchphrase_tag)

    intro_tag = series_page_soup.find(id=self._INTRO_TAG_ID)
    if intro_tag is not None:
        # Remove the expand button text from the end of the intro.
        expand_button_span = intro_tag.find(
            'span', class_=self._INTRO_EXPAND_BUTTON_CLASS)
        if expand_button_span is not None:
            expand_button_span.decompose()

        series_blog.introduction = html.parse_valid_child_text(intro_tag)
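# Illustrative sketch (not part of the crawler): the decompose-then-parse
# pattern used in _parse_series_intro, shown on hypothetical markup. The id
# and class values here are assumptions, not the site's real ones.
def _example_intro_expand_button_removal() -> None:
    from bs4 import BeautifulSoup

    markup = (
        '<div id="introduction">A story about a cat.'
        '<span class="expand-button">Read more</span></div>'
    )
    intro_tag = BeautifulSoup(markup, 'html.parser').find(id='introduction')

    # Removing the span before reading the text keeps the button label out
    # of the parsed introduction.
    intro_tag.find('span', class_='expand-button').decompose()
    print(intro_tag.get_text())  # -> 'A story about a cat.'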
def _parse_series_blog_info(
        self, series_page_soup: BeautifulSoup,
        series_page_url: str) -> JpnArticleBlog:
    """Parse the blog info for a series from its homepage.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_page_url: URL for the series page to parse.

    Returns:
        A JpnArticleBlog with the info from the given series homepage.
    """
    series_blog = JpnArticleBlog(
        title=html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_TITLE_TAG_ID).strip(),
        author=html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_AUTHOR_TAG_ID).strip(),
        source_name=self.SOURCE_NAME,
        source_url=series_page_url,
    )

    self._parse_series_rating_info(series_page_soup, series_blog)
    self._parse_series_tags(series_page_soup, series_blog)
    self._parse_series_intro(series_page_soup, series_blog)
    self._parse_series_meta_info_list(series_page_soup, series_blog)
    self._parse_series_review_info_list(series_page_soup, series_blog)

    return series_blog
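# Usage sketch (not part of the crawler): how _parse_series_blog_info might
# be driven for a single series homepage. The requests dependency, the
# `crawler` instance, and the URL are assumptions for illustration only.
def _example_parse_one_series(crawler) -> None:
    import requests
    from bs4 import BeautifulSoup

    series_page_url = 'https://example.com/series/123'  # hypothetical URL
    response = requests.get(series_page_url)
    series_page_soup = BeautifulSoup(response.content, 'html.parser')

    series_blog = crawler._parse_series_blog_info(
        series_page_soup, series_page_url)
    print(series_blog.title, series_blog.author)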
def _parse_series_tags(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the tags for a series.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    series_blog.tags = []
    genre = html.parse_text_from_descendant_by_id(
        series_page_soup, self._SERIES_GENRE_TAG_ID)
    series_blog.tags.append(genre.strip())

    # If a series has no tags set for it, the tag div won't exist on the
    # series page.
    tag_div = series_page_soup.find(id=self._SERIES_TAG_DIV_ID)
    if tag_div is None:
        return

    tag_lists = tag_div.find_all('ul')
    for tag_list in tag_lists:
        for tag_element in tag_list.find_all('li'):
            series_blog.tags.append(
                html.parse_valid_child_text(tag_element).strip())
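# Illustrative sketch (not part of the crawler): the nested ul/li walk used
# in _parse_series_tags, shown on hypothetical markup. The div id is an
# assumption.
def _example_tag_list_walk() -> None:
    from bs4 import BeautifulSoup

    markup = (
        '<div id="tags">'
        '<ul><li>fantasy</li><li>adventure</li></ul>'
        '<ul><li>comedy</li></ul>'
        '</div>'
    )
    tag_div = BeautifulSoup(markup, 'html.parser').find(id='tags')

    tags = []
    for tag_list in tag_div.find_all('ul'):
        for tag_element in tag_list.find_all('li'):
            tags.append(tag_element.get_text().strip())
    print(tags)  # -> ['fantasy', 'adventure', 'comedy']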
def _parse_series_rating_info(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the rating info for a series.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    rating_str = html.parse_text_from_descendant_by_id(
        series_page_soup, self._SERIES_RATING_TAG_ID)
    series_blog.rating = float(re.sub('[^0-9]', '', rating_str))

    rating_count_str = html.parse_text_from_descendant_by_class(
        series_page_soup, self._SERIES_RATING_COUNT_CLASS, 'span')
    series_blog.rating_count = int(re.sub('[^0-9]', '', rating_count_str))
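# Worked example (not part of the crawler): the digit stripping used in
# _parse_series_rating_info. The sample strings are assumptions about the
# site's formatting; the point is that removing every non-digit character
# leaves the raw count.
def _example_strip_non_digits() -> None:
    import re

    print(float(re.sub('[^0-9]', '', '★528')))   # -> 528.0
    print(int(re.sub('[^0-9]', '', '1,234件')))  # -> 1234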
def convert_docs_to_blogs(
        docs: List[Document]) -> Dict[ObjectId, JpnArticleBlog]:
    """Convert MongoDB BSON documents to blog objects.

    Args:
        docs: The blog documents to convert.

    Returns:
        A mapping from each blog document's MongoDB ObjectId to the created
        blog object for that blog document.
    """
    oid_blog_map = {}
    for doc in docs:
        oid_blog_map[doc['_id']] = JpnArticleBlog(
            title=doc['title'],
            author=doc['author'],
            source_name=doc['source_name'],
            source_url=doc['source_url'],
            publication_datetime=doc['publication_datetime'],
            last_updated_datetime=doc['last_updated_datetime'],
            rating=utils.float_or_none(doc['rating']),
            rating_count=utils.int_or_none(doc['rating_count']),
            tags=doc['tags'],
            catchphrase=doc.get('catchphrase'),
            introduction=doc.get('introduction'),
            article_count=utils.int_or_none(doc['article_count']),
            total_char_count=utils.int_or_none(doc['total_char_count']),
            comment_count=utils.int_or_none(doc['comment_count']),
            follower_count=utils.int_or_none(doc['follower_count']),
            in_serialization=doc['in_serialization'],
            last_crawled_datetime=doc.get('last_crawled_datetime'),
        )

    return oid_blog_map
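# Usage sketch (not part of the module): fetching blog documents with
# pymongo and converting them. The database and collection names are
# assumptions; adjust to the real layout.
def _example_convert_blog_docs() -> None:
    from pymongo import MongoClient

    client = MongoClient()
    docs = list(client['crawl_db']['blogs'].find({}))  # hypothetical names

    oid_blog_map = convert_docs_to_blogs(docs)
    for oid, blog in oid_blog_map.items():
        print(oid, blog.title)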
def _parse_series_review_info_list(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the data in the review info list for a series.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    info_lists = html.select_descendants_by_class(
        series_page_soup, self._SERIES_INFO_LIST_CLASS, 'dl', 2)
    review_info_list = info_lists[1]

    comment_count_str = html.parse_desc_list_data_text(
        review_info_list, self._COMMENT_COUNT_TERM)
    series_blog.comment_count = self._parse_count_string(
        comment_count_str, self._COMMENT_COUNT_REGEX)

    follower_count_str = html.parse_desc_list_data_text(
        review_info_list, self._FOLLOWER_COUNT_TERM)
    series_blog.follower_count = self._parse_count_string(
        follower_count_str, self._FOLLOWER_COUNT_REGEX)
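# Illustrative sketch (not part of the crawler): a <dt>/<dd> lookup that
# presumably mirrors what html.parse_desc_list_data_text does, i.e. find the
# <dd> following the <dt> whose text matches a term. The markup and term
# strings are assumptions.
def _example_desc_list_lookup() -> None:
    from bs4 import BeautifulSoup

    markup = (
        '<dl><dt>Comments</dt><dd>12 comments</dd>'
        '<dt>Followers</dt><dd>345 followers</dd></dl>'
    )
    soup = BeautifulSoup(markup, 'html.parser')

    term_dt = soup.find('dt', string='Comments')
    print(term_dt.find_next_sibling('dd').get_text())  # -> '12 comments'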
def _parse_series_meta_info_list(
        self, series_page_soup: BeautifulSoup,
        series_blog: JpnArticleBlog) -> None:
    """Parse the data in the meta info list for a series.

    Args:
        series_page_soup: A BeautifulSoup initialized with the content
            from a series homepage.
        series_blog: The blog object to store the parsed data in.
    """
    info_lists = html.select_descendants_by_class(
        series_page_soup, self._SERIES_INFO_LIST_CLASS, 'dl', 2)
    meta_info_list = info_lists[0]

    publication_datetime_dd = html.select_desc_list_data(
        meta_info_list, self._PUBLICATION_DATETIME_TERM)
    series_blog.publication_datetime = html.parse_time_descendant(
        publication_datetime_dd)

    last_updated_datetime_dd = html.select_desc_list_data(
        meta_info_list, self._LAST_UPDATED_DATETIME_TERM)
    series_blog.last_updated_datetime = html.parse_time_descendant(
        last_updated_datetime_dd)

    article_count_str = html.parse_desc_list_data_text(
        meta_info_list, self._ARTICLE_COUNT_TERM)
    series_blog.article_count = self._parse_count_string(
        article_count_str, self._ARTICLE_COUNT_REGEX)

    total_char_count_str = html.parse_desc_list_data_text(
        meta_info_list, self._TOTAL_CHAR_COUNT_TERM)
    series_blog.total_char_count = self._parse_count_string(
        total_char_count_str, self._TOTAL_CHAR_COUNT_REGEX)

    serialization_status_str = html.parse_desc_list_data_text(
        meta_info_list, self._SERIALIZATION_STATUS_TERM)
    series_blog.in_serialization = (
        serialization_status_str == self._IN_SERIALIZATION_STATUS)
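# Illustrative sketch (not part of the crawler): parsing a <time> tag's
# datetime attribute, presumably similar to what html.parse_time_descendant
# does. The markup is an assumption.
def _example_time_tag_parse() -> None:
    from datetime import datetime

    from bs4 import BeautifulSoup

    markup = '<dd><time datetime="2020-04-01T12:30:00">Apr 1</time></dd>'
    time_tag = BeautifulSoup(markup, 'html.parser').find('time')
    print(datetime.fromisoformat(time_tag['datetime']))
    # -> 2020-04-01 12:30:00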
def _parse_search_results_page(
        self, page_soup: BeautifulSoup) -> List[JpnArticleBlog]:
    """Parse the series blog info from a search results page.

    Args:
        page_soup: A BeautifulSoup initialized with the content from a
            search page.

    Returns:
        A list of the series blog info for all of the series listed in the
        search results page.
    """
    series_blogs = []
    series_tiles = html.select_descendants_by_class(
        page_soup, self._SEARCH_RESULT_TILE_CLASS, 'div')
    _log.debug('Found %s series on search results page', len(series_tiles))

    for series_tile in series_tiles:
        series_blog = JpnArticleBlog(source_name=self.SOURCE_NAME)

        title_link_tag = html.select_one_descendant_by_class(
            series_tile, self._SEARCH_RESULT_TITLE_CLASS, 'a')
        series_blog.title = html.parse_valid_child_text(
            title_link_tag).strip()
        series_blog.source_url = utils.strip_url_query_and_frag(
            title_link_tag['href'])

        series_blog.author = html.parse_text_from_descendant_by_class(
            series_tile, self._SEARCH_RESULT_AUTHOR_CLASS, 'a').strip()

        last_updated_str = html.parse_text_from_descendant_by_class(
            series_tile, self._SEARCH_RESULT_LAST_UPDATED_CLASS, 'span')
        series_blog.last_updated_datetime = (
            self._parse_search_result_datetime(last_updated_str))

        series_blogs.append(series_blog)

    return series_blogs
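# Usage sketch (not part of the crawler): driving _parse_search_results_page
# for one page of search results. The requests dependency, the `crawler`
# instance, and the URL are assumptions for illustration only.
def _example_list_search_results(crawler) -> None:
    import requests
    from bs4 import BeautifulSoup

    search_url = 'https://example.com/search?page=1'  # hypothetical URL
    response = requests.get(search_url)
    page_soup = BeautifulSoup(response.content, 'html.parser')

    for series_blog in crawler._parse_search_results_page(page_soup):
        print(series_blog.title, series_blog.source_url)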