Esempio n. 1
0
    def _parse_series_intro(self, series_page_soup: BeautifulSoup,
                            series_blog: JpnArticleBlog) -> None:
        """Parse the intro and catchphrase for a series.

        Neither field is required, so a series may have either or both unset;
        a field is only stored on the blog if its tag exists on the page.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object to store the parsed data in.
        """
        catchphrase = series_page_soup.find(id=self._CATCHPHRASE_TAG_ID)
        if catchphrase is not None:
            series_blog.catchphrase = html.parse_valid_child_text(catchphrase)

        intro = series_page_soup.find(id=self._INTRO_TAG_ID)
        if intro is None:
            return

        # The intro can end with an expand button whose text must not be
        # included in the parsed intro text, so remove it before parsing.
        expand_button = intro.find(
            'span', class_=self._INTRO_EXPAND_BUTTON_CLASS)
        if expand_button is not None:
            expand_button.decompose()

        series_blog.introduction = html.parse_valid_child_text(intro)
Esempio n. 2
0
    def _parse_series_blog_info(self, series_page_soup: BeautifulSoup,
                                series_page_url: str) -> JpnArticleBlog:
        """Parse the blog info for a series from its homepage.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_page_url: Url for the series page to parse.

        Returns:
            A JpnArticleBlog with the info from the given series homepage.
        """
        title = html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_TITLE_TAG_ID)
        author = html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_AUTHOR_TAG_ID)
        blog = JpnArticleBlog(
            title=title.strip(),
            author=author.strip(),
            source_name=self.SOURCE_NAME,
            source_url=series_page_url,
        )

        # Each sub-parser fills in its own subset of the blog's fields in
        # place, in the same order as before.
        for parse in (self._parse_series_rating_info,
                      self._parse_series_tags,
                      self._parse_series_intro,
                      self._parse_series_meta_info_list,
                      self._parse_series_review_info_list):
            parse(series_page_soup, blog)

        return blog
Esempio n. 3
0
    def _parse_series_tags(self, series_page_soup: BeautifulSoup,
                           series_blog: JpnArticleBlog) -> None:
        """Parse the tags for a series.

        The genre is always stored as the first tag; any further tags are
        optional.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object to store the parsed data in.
        """
        series_blog.tags = []
        genre_text = html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_GENRE_TAG_ID)
        series_blog.tags.append(genre_text.strip())

        # A series with no tags set has no tag div on its series page at all.
        tag_div = series_page_soup.find(id=self._SERIES_TAG_DIV_ID)
        if tag_div is None:
            return

        series_blog.tags.extend(
            html.parse_valid_child_text(item).strip()
            for tag_list in tag_div.find_all('ul')
            for item in tag_list.find_all('li'))
Esempio n. 4
0
    def _parse_series_rating_info(self, series_page_soup: BeautifulSoup,
                                  series_blog: JpnArticleBlog) -> None:
        """Parse the rating info for a series.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object to store the parsed data in.
        """
        def digits_only(text: str) -> str:
            # Both values are displayed with non-digit decoration (e.g.
            # separators), so keep only the ASCII digits before converting.
            return re.sub('[^0-9]', '', text)

        rating_text = html.parse_text_from_descendant_by_id(
            series_page_soup, self._SERIES_RATING_TAG_ID)
        series_blog.rating = float(digits_only(rating_text))

        rating_count_text = html.parse_text_from_descendant_by_class(
            series_page_soup, self._SERIES_RATING_COUNT_CLASS, 'span')
        series_blog.rating_count = int(digits_only(rating_count_text))
Esempio n. 5
0
def convert_docs_to_blogs(
    docs: List[Document]
) -> Dict[ObjectId, JpnArticleBlog]:
    """Convert MongoDB BSON documents to blog objects.

    Returns:
        A mapping from each blog document's MongoDB ObjectId to the created
        blog object for that blog document.
    """
    def blog_from_doc(doc: Document) -> JpnArticleBlog:
        # Optional fields are read with get() so missing keys become None
        # instead of raising KeyError; all other keys are required.
        return JpnArticleBlog(
            title=doc['title'],
            author=doc['author'],
            source_name=doc['source_name'],
            source_url=doc['source_url'],
            publication_datetime=doc['publication_datetime'],
            last_updated_datetime=doc['last_updated_datetime'],
            rating=utils.float_or_none(doc['rating']),
            rating_count=utils.int_or_none(doc['rating_count']),
            tags=doc['tags'],
            catchphrase=doc.get('catchphrase'),
            introduction=doc.get('introduction'),
            article_count=utils.int_or_none(doc['article_count']),
            total_char_count=utils.int_or_none(doc['total_char_count']),
            comment_count=utils.int_or_none(doc['comment_count']),
            follower_count=utils.int_or_none(doc['follower_count']),
            in_serialization=doc['in_serialization'],
            last_crawled_datetime=doc.get('last_crawled_datetime'),
        )

    return {doc['_id']: blog_from_doc(doc) for doc in docs}
Esempio n. 6
0
    def _parse_series_review_info_list(self, series_page_soup: BeautifulSoup,
                                       series_blog: JpnArticleBlog) -> None:
        """Parse the data in the review info list for a series.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object to store the parsed data in.
        """
        # The review info list is the second of the two info lists on the
        # page; the first holds the meta info.
        review_info_list = html.select_descendants_by_class(
            series_page_soup, self._SERIES_INFO_LIST_CLASS, 'dl', 2)[1]

        series_blog.comment_count = self._parse_count_string(
            html.parse_desc_list_data_text(
                review_info_list, self._COMMENT_COUNT_TERM),
            self._COMMENT_COUNT_REGEX)

        series_blog.follower_count = self._parse_count_string(
            html.parse_desc_list_data_text(
                review_info_list, self._FOLLOWER_COUNT_TERM),
            self._FOLLOWER_COUNT_REGEX)
Esempio n. 7
0
    def _parse_series_meta_info_list(self, series_page_soup: BeautifulSoup,
                                     series_blog: JpnArticleBlog) -> None:
        """Parse the data in the meta info list for a series.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: The blog object to store the parsed data in.
        """
        # The meta info list is the first of the two info lists on the page;
        # the second holds the review info.
        meta_info_list = html.select_descendants_by_class(
            series_page_soup, self._SERIES_INFO_LIST_CLASS, 'dl', 2)[0]

        def parse_datetime_for(term):
            # Datetime values live in a <time> descendant of the term's <dd>.
            data = html.select_desc_list_data(meta_info_list, term)
            return html.parse_time_descendant(data)

        def parse_count_for(term, count_regex):
            text = html.parse_desc_list_data_text(meta_info_list, term)
            return self._parse_count_string(text, count_regex)

        series_blog.publication_datetime = parse_datetime_for(
            self._PUBLICATION_DATETIME_TERM)
        series_blog.last_updated_datetime = parse_datetime_for(
            self._LAST_UPDATED_DATETIME_TERM)

        series_blog.article_count = parse_count_for(
            self._ARTICLE_COUNT_TERM, self._ARTICLE_COUNT_REGEX)
        series_blog.total_char_count = parse_count_for(
            self._TOTAL_CHAR_COUNT_TERM, self._TOTAL_CHAR_COUNT_REGEX)

        serialization_status = html.parse_desc_list_data_text(
            meta_info_list, self._SERIALIZATION_STATUS_TERM)
        series_blog.in_serialization = (
            serialization_status == self._IN_SERIALIZATION_STATUS)
Esempio n. 8
0
    def _parse_search_results_page(
            self, page_soup: BeautifulSoup) -> List[JpnArticleBlog]:
        """Parse the series blog info from a search results page.

        Args:
            page_soup: A BeautifulSoup initialized with the content from a
                search page.

        Returns:
            A list of the series blog info for all of the series listed in the
            search results page.
        """
        tiles = html.select_descendants_by_class(
            page_soup, self._SEARCH_RESULT_TILE_CLASS, 'div')
        _log.debug('Found %s series on search results page', len(tiles))

        blogs = []
        for tile in tiles:
            blog = JpnArticleBlog(source_name=self.SOURCE_NAME)

            # The title link provides both the series title text and its URL.
            title_link = html.select_one_descendant_by_class(
                tile, self._SEARCH_RESULT_TITLE_CLASS, 'a')
            blog.title = html.parse_valid_child_text(title_link).strip()
            # Strip the query string and fragment from the link URL.
            blog.source_url = utils.strip_url_query_and_frag(
                title_link['href'])

            blog.author = html.parse_text_from_descendant_by_class(
                tile, self._SEARCH_RESULT_AUTHOR_CLASS, 'a').strip()

            blog.last_updated_datetime = self._parse_search_result_datetime(
                html.parse_text_from_descendant_by_class(
                    tile, self._SEARCH_RESULT_LAST_UPDATED_CLASS, 'span'))

            blogs.append(blog)

        return blogs