Example #1
    def parse_entry_media_attributes(self, soup):
        """
      Args:
        soup: a bs4 element containing a row from the current media list

      Return a dict of attributes of the media the row is about.
    """
        row_info = {}

        start = None  # default so a suppressed parse error can't leave it unbound
        try:
            start = utilities.parse_profile_date(
                soup.find('series_start').text)
        except ValueError:
            start = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        if start is not None:
            try:
                row_info['aired'] = (start,
                                     utilities.parse_profile_date(
                                         soup.find('series_end').text))
            except ValueError:
                row_info['aired'] = (start, None)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

        # look up the given media type's status terms.
        status_terms = getattr(self.session, self.type)(1)._status_terms

        try:
            row_info['id'] = int(
                soup.find('series_' + self.type + 'db_id').text)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            row_info['title'] = soup.find('series_title').text
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            row_info['status'] = status_terms[int(
                soup.find('series_status').text)]
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            row_info['picture'] = soup.find('series_image').text
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return row_info
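
Note: every example in this section leans on utilities.parse_profile_date, which is not shown here. A minimal sketch of what such a helper might look like, assuming MAL-style date strings such as 'Feb 24, 2003' (the accepted formats below are an assumption, not the library's actual list; Example #17 shows the real helper also takes a suppress keyword):

    import datetime

    def parse_profile_date(text, suppress=False):
        """Hypothetical stand-in for utilities.parse_profile_date."""
        text = text.strip()
        for fmt in ('%b %d, %Y', '%b %Y', '%Y'):
            try:
                return datetime.datetime.strptime(text, fmt).date()
            except ValueError:
                continue
        if suppress:
            return None
        raise ValueError('could not parse date: {!r}'.format(text))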
Example #2
    def parse_entry(self, soup):
        """
      Given:
        soup: a bs4 element containing a row from the current media list

      Return a tuple:
        (media object, dict of this row's parseable attributes)
    """
        # parse the media object first.
        media_attrs = self.parse_entry_media_attributes(soup)
        media_id = media_attrs[u'id']
        del media_attrs[u'id']
        media = getattr(self.session, self.type)(media_id).set(media_attrs)

        entry_info = {}
        try:
            entry_info[u'started'] = utilities.parse_profile_date(
                soup.find(u'my_start_date').text)
        except ValueError:
            entry_info[u'started'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'finished'] = utilities.parse_profile_date(
                soup.find(u'my_finish_date').text)
        except ValueError:
            entry_info[u'finished'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'status'] = self.user_status_terms[int(
                soup.find(u'my_status').text)]
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'score'] = int(soup.find(u'my_score').text)
            # if the user hasn't set a score, store None instead.
            if entry_info[u'score'] == 0:
                entry_info[u'score'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'last_updated'] = datetime.datetime.fromtimestamp(
                int(soup.find(u'my_last_updated').text))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media, entry_info
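
Note: the try/except-with-conditional-reraise pattern above repeats for every field. One possible refactoring, sketched as a context manager; this is not part of the library, only the session's suppress_parse_exceptions flag is taken from the examples, and fallback assignments like the ValueError arms would still be written by hand:

    import contextlib

    @contextlib.contextmanager
    def parse_step(session, swallow=()):
        """Run one parse step; re-raise unless the session suppresses parse errors."""
        try:
            yield
        except swallow:
            # exceptions listed in `swallow` (e.g. ValueError) are always ignored
            pass
        except Exception:
            if not session.suppress_parse_exceptions:
                raise

    # usage sketch:
    #     with parse_step(self.session, swallow=(ValueError,)):
    #         entry_info[u'score'] = int(soup.find(u'my_score').text)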
Example #3
    def parse_entry_media_attributes(self, soup):
        """
          Args:
            soup: a bs4 element containing a row from the current media list

          Return a dict of attributes of the media the row is about.
        """
        row_info = {}

        start = None  # default so a suppressed parse error can't leave it unbound
        try:
            start = utilities.parse_profile_date(soup.find('series_start').text)
        except ValueError:
            start = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        if start is not None:
            try:
                row_info['aired'] = (start, utilities.parse_profile_date(soup.find('series_end').text))
            except ValueError:
                row_info['aired'] = (start, None)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

        # look up the given media type's status terms.
        status_terms = getattr(self.session, self.type)(1)._status_terms

        try:
            row_info['id'] = int(soup.find('series_' + self.type + 'db_id').text)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            row_info['title'] = soup.find('series_title').text
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            row_info['status'] = status_terms[int(soup.find('series_status').text)]
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            row_info['picture'] = soup.find('series_image').text
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return row_info
Example #4
    def parse_entry(self, soup):
        """
          Given:
            soup: a bs4 element containing a row from the current media list

          Return a tuple:
            (media object, dict of this row's parseable attributes)
        """
        # parse the media object first.
        media_attrs = self.parse_entry_media_attributes(soup)
        media_id = media_attrs[u'id']
        del media_attrs[u'id']
        media = getattr(self.session, self.type)(media_id).set(media_attrs)

        entry_info = {}
        try:
            entry_info[u'started'] = utilities.parse_profile_date(soup.find(u'my_start_date').text)
        except ValueError:
            entry_info[u'started'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'finished'] = utilities.parse_profile_date(soup.find(u'my_finish_date').text)
        except ValueError:
            entry_info[u'finished'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'status'] = self.user_status_terms[int(soup.find(u'my_status').text)]
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'score'] = int(soup.find(u'my_score').text)
            # if the user hasn't set a score, store None instead.
            if entry_info[u'score'] == 0:
                entry_info[u'score'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            entry_info[u'last_updated'] = datetime.datetime.fromtimestamp(int(soup.find(u'my_last_updated').text))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media, entry_info
Example #5
    def _parse_last_list_updates(self, user_page):
        """parse user last media update (manga and anime)."""
        try:
            divs_zip = []
            for mode in ['anime', 'manga']:
                divs = user_page.select('div.updates.{}'.format(mode))[0].select('div')
                divs_zip.append((mode, divs))

            media_list = {}
            for mode, divs in divs_zip:
                for div in divs:
                    # parse the media
                    try:
                        media_link_tag = div.select('a')[0]
                    except IndexError:
                        continue
                    media_link = media_link_tag.get('href')
                    media_id = int(media_link.split('/{}/'.format(mode))[1].split('/')[0])
                    media_title = media_link_tag.text
                    media = getattr(self.session, mode)(media_id).set({'title': media_title})
                    # stats
                    status_tag = div.select('div')[-1]
                    status = status_tag.text
                    # update_date
                    date_tag = div.select('div span')[0]
                    update_date = utilities.parse_profile_date(date_tag.text)
                    # build this row's entry, then attach the update time
                    media_list[media] = self._parse_update_media_status(status)
                    media_list[media]['time'] = update_date

            return media_list
        except:
            if not self.session.suppress_parse_exceptions:
                raise
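
Note: the CSS selectors above can be exercised in isolation. A self-contained demo of the div.updates.anime lookup against simplified markup (the HTML below is invented for illustration and is far simpler than a real MAL profile page):

    from bs4 import BeautifulSoup

    html = '''
    <div class="updates anime">
      <div>
        <a href="/anime/1/Cowboy_Bebop">Cowboy Bebop</a>
        <div><span>Feb 24, 2003</span></div>
        <div>Completed</div>
      </div>
    </div>
    '''
    page = BeautifulSoup(html, 'html.parser')
    divs = page.select('div.updates.anime')[0].select('div')
    for div in divs:
        links = div.select('a')
        if not links:
            continue  # same role as the except IndexError above
        href = links[0].get('href')
        media_id = int(href.split('/anime/')[1].split('/')[0])
        status = div.select('div')[-1].text
        print(media_id, links[0].text, status)  # 1 Cowboy Bebop Completed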
Example #6
    def _get_user_stats(self, user_page, stats_type, type_txt):
        """get user stats."""
        assert stats_type in ['birthday', 'last_online', 'gender', 'join_date', 'location']

        # get tags and try to filter it
        user_stats_tag = user_page.select_one('.user-status')
        stats_tags = [
            x for x in user_stats_tag.select('li > span')
            if type_txt in x.text
        ]

        # return default value if nothing found
        if not stats_tags:
            if stats_type == 'gender':
                return 'Not specified'
            else:
                return None

        # process the html tag
        stats_tag = stats_tags[0].parent
        stats_text = stats_tag.text.split(type_txt)[1].strip()

        # parse the end result based on stats type
        if stats_type in ['birthday', 'last_online', 'join_date']:
            return utilities.parse_profile_date(stats_text)
        else:
            return stats_text
Example #7
 def parse_media(self, user_page, mode='anime'):
     # get div tags
     divs = user_page.select(
         'div.updates.{}'.format(mode))[0].select('div')
     media_list = {}
     for div in divs:
         # parse the media
         try:
             media_link_tag = div.select('a')[0]
         except IndexError:
             continue
         media_link = media_link_tag.get('href')
         media_id = int(
             media_link.split('/{}/'.format(mode))[1].split('/')[0])
         media_title = media_link_tag.text
         media = getattr(self.session,
                         mode)(media_id).set({'title': media_title})
         # stats
         status_tag = div.select('div')[-1]
         status = status_tag.text
         # update_date
         date_tag = div.select('div span')[0]
         update_date = utilities.parse_profile_date(date_tag.text)
         # build this row's entry, then attach the update time
         media_list[media] = self._parse_update_media_status(status)
         media_list[media]['time'] = update_date
     return media_list
Example #8
 def _parse_sidebar_user_status(self, user_page):
     """Parse the DOM and return user status on sidebar."""
     user_info = {}
     top_section = self._parse_sidebar_user_status_top_section(user_page)
     # variable for easier key on key comparator
     parse_date = 'parse_date'
     user_info_key = 'user_info_key'
     # top part of side bar
     key_comparator = {
         'last online': {
             user_info_key: u'last_online',
             parse_date: True
         },
         'gender': {
             user_info_key: u'gender',
             parse_date: False
         },
         'birthday': {
             user_info_key: u'birthday',
             parse_date: True
         },
         'location': {
             user_info_key: u'location',
             parse_date: False
         },
         'joined': {
             user_info_key: u'join_date',
             parse_date: True
         },
     }
     # bottom part of sidebar
     bottom_part_key = [
         ['forum posts', u'num_forum_posts'],
         ['reviews', u'num_reviews'],
         ['recommendations', u'num_recommendations'],
         ['blog posts', u'num_blog_posts'],
         ['clubs', u'num_clubs'],
     ]
     for key in bottom_part_key:  # add bottom part key to key_comparator
         key_comparator[key[0]] = {user_info_key: key[1], parse_date: False}
     # convert top section dictionary into user info dict
     for keyc in key_comparator:
         if keyc in top_section:
             info_key = key_comparator[keyc][user_info_key]
             if key_comparator[keyc][parse_date]:
                 user_info[info_key] = utilities.parse_profile_date(top_section[keyc])
             else:
                 user_info[info_key] = top_section[keyc]
     # fix bottom part keys on user info.
     # remove the comma and convert into integer
     for key in bottom_part_key:
         try:
             if user_info[key[1]] is not None and not isinstance(user_info[key[1]], int):
                 user_info[key[1]] = int(user_info[key[1]].replace(',', ''))
         except KeyError:
             pass  # key was never parsed into user_info; nothing to convert
     return user_info
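
Note: the bottom-part cleanup depends on stripping thousands separators before int(). The table-driven mapping and that conversion, checked in isolation with made-up values:

    key_comparator = {'reviews': {'user_info_key': 'num_reviews', 'parse_date': False}}
    top_section = {'reviews': '1,234'}

    user_info = {}
    for keyc, spec in key_comparator.items():
        if keyc in top_section:
            user_info[spec['user_info_key']] = top_section[keyc]

    # remove the comma and convert into an integer
    user_info['num_reviews'] = int(user_info['num_reviews'].replace(',', ''))
    print(user_info)  # {'num_reviews': 1234}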
Example #9
    def parse_friends(self, friends_page):
        """Parse the DOM and returns user friends attributes.

        :type friends_page: :class:`bs4.BeautifulSoup`
        :param friends_page: MAL user friends page's DOM

        :rtype: dict
        :return: User friends attributes.

        """
        user_info = self.parse_sidebar(friends_page)
        second_col = (
            friends_page
            .find(u'div', {u'id': u'content'})
            .find(u'table')
            .find(u'tr')
            .find_all(u'td', recursive=False)[1]
        )

        try:
            user_info['friends'] = {}

            friends = second_col.find_all('div', {'class': 'friendHolder'})
            if friends:
                for row in friends:
                    block = row.find('div', {'class': 'friendBlock'})
                    cols = block.find_all('div')

                    friend_link = cols[1].find('a')
                    friend = self.session.user(friend_link.text)

                    friend_info = {}
                    if len(cols) > 2 and cols[2].text != u'':
                        col_txt = cols[2].text.strip()
                        friend_info[u'last_active'] = utilities.parse_profile_date(col_txt)

                    if len(cols) > 3 and cols[3].text != '':
                        friend_info['since'] = utilities.parse_profile_date(
                            cols[3].text.replace('Friends since', '').strip())
                    user_info['friends'][friend] = friend_info
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
Example #10
    def parse_recommendations(self, recommendations_page):
        """Parse the DOM and returns user recommendations attributes.

        :type recommendations_page: :class:`bs4.BeautifulSoup`
        :param recommendations_page: MAL user recommendations page's DOM

        :rtype: dict
        :return: User recommendations attributes.

        """
        user_info = self.parse_sidebar(recommendations_page)
        second_col = (
            recommendations_page
            .find(u'div', {u'id': u'content'})
            .find(u'table')
            .find(u'tr')
            .find_all(u'td', recursive=False)[1]
        )

        try:
            recommendations = second_col.find_all("div", {"class": "spaceit borderClass"})
            if recommendations:
                user_info['recommendations'] = {}
                for row in recommendations[1:]:
                    anime_table = row.find(u'table')
                    animes = anime_table.find_all(u'td')
                    # find liked media
                    liked_media_link = animes[0].find(u'a', recursive=False)
                    link_parts = liked_media_link.get(u'href').split(u'/')
                    # of the form /anime|manga/64/Rozen_Maiden
                    liked_media = getattr(self.session, link_parts[1])(int(link_parts[2])).set(
                        {u'title': liked_media_link.text}
                    )
                    # find recommended media
                    recommended_media_link = animes[1].find(u'a', recursive=False)
                    link_parts = recommended_media_link.get(u'href').split(u'/')
                    # of the form /anime|manga/64/Rozen_Maiden
                    media_id = int(link_parts[2])
                    recommended_media = getattr(self.session, link_parts[1])(media_id).set(
                        {u'title': recommended_media_link.text}
                    )
                    # other stats from recommended media
                    recommendation_text = row.find(u'p').text
                    recommendation_menu = row.find(u'div', recursive=False)
                    utilities.extract_tags(recommendation_menu)
                    rec_menu_text = recommendation_menu.text.split(u' - ')[1]
                    recommendation_date = utilities.parse_profile_date(rec_menu_text)

                    user_info['recommendations'][liked_media] = {
                        link_parts[1]: recommended_media,
                        'text': recommendation_text,
                        'date': recommendation_date,
                    }
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
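
Note: the link_parts indexing above relies on the leading slash producing an empty first element. Worth seeing once on its own:

    link = '/anime/64/Rozen_Maiden'
    parts = link.split('/')
    print(parts)                    # ['', 'anime', '64', 'Rozen_Maiden']
    print(parts[1], int(parts[2]))  # anime 64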
Example #11
    def parse_recommendations(self, recommendations_page):
        """Parses the DOM and returns user recommendations attributes.

    :type recommendations_page: :class:`bs4.BeautifulSoup`
    :param recommendations_page: MAL user recommendations page's DOM

    :rtype: dict
    :return: User recommendations attributes.

    """
        user_info = self.parse_sidebar(recommendations_page)
        second_col = recommendations_page.find(u'div', {
            u'id': u'content'
        }).find(u'table').find(u'tr').find_all(u'td', recursive=False)[1]

        try:
            recommendations = second_col.find_all(
                u"div", {u"class": u"spaceit borderClass"})
            if recommendations:
                user_info[u'recommendations'] = {}
                for row in recommendations[1:]:
                    anime_table = row.find(u'table')
                    animes = anime_table.find_all(u'td')
                    liked_media_link = animes[0].find(u'a', recursive=False)
                    link_parts = liked_media_link.get(u'href').split(u'/')
                    # of the form /anime|manga/64/Rozen_Maiden
                    liked_media = getattr(self.session, link_parts[1])(int(
                        link_parts[2])).set({u'title': liked_media_link.text})

                    recommended_media_link = animes[1].find(u'a',
                                                            recursive=False)
                    link_parts = recommended_media_link.get(u'href').split(
                        u'/')
                    # of the form /anime|manga/64/Rozen_Maiden
                    recommended_media = getattr(self.session, link_parts[1])(
                        int(link_parts[2])).set(
                            {u'title': recommended_media_link.text})

                    recommendation_text = row.find(u'p').text

                    recommendation_menu = row.find(u'div', recursive=False)
                    utilities.extract_tags(recommendation_menu)
                    recommendation_date = utilities.parse_profile_date(
                        recommendation_menu.text.split(u' - ')[1])

                    user_info[u'recommendations'][liked_media] = {
                        link_parts[1]: recommended_media,
                        'text': recommendation_text,
                        'date': recommendation_date
                    }
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
Example #12
    def parse_friends(self, friends_page):
        """Parses the DOM and returns user friends attributes.

    :type friends_page: :class:`bs4.BeautifulSoup`
    :param friends_page: MAL user friends page's DOM

    :rtype: dict
    :return: User friends attributes.

    """
        user_info = self.parse_sidebar(friends_page)
        second_col = friends_page.find(u'div', {
            u'id': u'content'
        }).find(u'table').find(u'tr').find_all(u'td', recursive=False)[1]

        try:
            user_info[u'friends'] = {}

            friends = second_col.find_all(u'div', {u'class': u'friendHolder'})
            if friends:
                for row in friends:
                    block = row.find(u'div', {u'class': u'friendBlock'})
                    cols = block.find_all(u'div')

                    friend_link = cols[1].find(u'a')
                    friend = self.session.user(friend_link.text)

                    friend_info = {}
                    if len(cols) > 2 and cols[2].text != u'':
                        friend_info[u'last_active'] = utilities.parse_profile_date(
                            cols[2].text.strip())

                    if len(cols) > 3 and cols[3].text != u'':
                        friend_info[u'since'] = utilities.parse_profile_date(
                            cols[3].text.replace(u'Friends since', '').strip())
                    user_info[u'friends'][friend] = friend_info
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
Example #13
  def parse_reviews(self, reviews_page):
    """Parses the DOM and returns user reviews attributes.

    :type reviews_page: :class:`bs4.BeautifulSoup`
    :param reviews_page: MAL user reviews page's DOM

    :rtype: dict
    :return: User reviews attributes.

    """
    user_info = self.parse_sidebar(reviews_page)
    second_col = reviews_page.find(u'div', {u'id': u'content'}).find(u'table').find(u'tr').find_all(u'td', recursive=False)[1]

    try:
      user_info[u'reviews'] = {}
      reviews = second_col.find_all(u'div', {u'class': u'borderDark'}, recursive=False)
      if reviews:
        for row in reviews:
          review_info = {}
          meta_elt, review_elt = row.find_all(u'div', recursive=False)[0:2]
          meta_rows = meta_elt.find_all(u'div', recursive=False)
          review_info[u'date'] = utilities.parse_profile_date(meta_rows[0].find(u'div').text)
          media_link = meta_rows[0].find(u'a')
          link_parts = media_link.get(u'href').split(u'/')
          # of the form /(anime|manga)/9760/Hoshi_wo_Ou_Kodomo
          media = getattr(self.session, link_parts[1])(int(link_parts[2])).set({u'title': media_link.text})

          helpfuls = meta_rows[1].find(u'span', recursive=False)
          helpful_match = re.match(r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)', helpfuls.text).groupdict()
          review_info[u'people_helped'] = int(helpful_match[u'people_helped'])
          review_info[u'people_total'] = int(helpful_match[u'people_total'])

          consumption_match = re.match(r'(?P<media_consumed>[0-9]+) of (?P<media_total>[0-9?]+)', meta_rows[2].text).groupdict()
          review_info[u'media_consumed'] = int(consumption_match[u'media_consumed'])
          if consumption_match[u'media_total'] == u'?':
            review_info[u'media_total'] = None
          else:
            review_info[u'media_total'] = int(consumption_match[u'media_total'])

          review_info[u'rating'] = int(meta_rows[3].find(u'div').text.replace(u'Overall Rating: ', ''))

          for x in review_elt.find_all([u'div', 'a']):
            x.extract()
          review_info[u'text'] = review_elt.text.strip()
          user_info[u'reviews'][media] = review_info
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    return user_info
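
Note: the helpfulness counts are parsed with a named-group regex. It can be checked standalone (the sample string follows the 'X of Y' shape the code expects):

    import re

    pattern = r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)'
    match = re.match(pattern, '12 of 20 people found this review helpful')
    print(match.groupdict())  # {'people_helped': '12', 'people_total': '20'}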
Example #15
 def _parse_sidebar_user_status(self, user_page):
     """Parse the DOM and return user status on sidebar."""
     user_info = {}
     top_section = self._parse_sidebar_user_status_top_section(user_page)
     # variable for easier key on key comparator
     parse_date = 'parse_date'
     user_info_key = 'user_info_key'
     # top part of side bar
     key_comparator = {
         'last online': {user_info_key: 'last_online', parse_date: True},
         'gender': {user_info_key: 'gender', parse_date: False},
         'birthday': {user_info_key: 'birthday', parse_date: True},
         'location': {user_info_key: 'location', parse_date: False},
         'joined': {user_info_key: 'join_date', parse_date: True},
     }
     # bottom part of sidebar
     bottom_part_key = [
         ['forum posts', 'num_forum_posts'],
         ['reviews', 'num_reviews'],
         ['recommendations', 'num_recommendations'],
         ['blog posts', 'num_blog_posts'],
         ['clubs', 'num_clubs'],
     ]
     for key in bottom_part_key:  # add bottom part key to key_comparator
         key_comparator[key[0]] = {user_info_key: key[1], parse_date: False}
     # convert top section dictionary into user info dict
     for keyc in key_comparator:
         if keyc in top_section:
             if key_comparator[keyc][parse_date]:
                 user_info[key_comparator[keyc][user_info_key]] = utilities.parse_profile_date(
                     top_section[keyc])
             else:
                 user_info[key_comparator[keyc][user_info_key]] = top_section[keyc]
     # fix bottom part keys on user info.
     # remove the comma and convert into integer
     for key in bottom_part_key:
         try:
             if user_info[key[1]] is not None and type(user_info[key[1]]) != int:
                 user_info[key[1]] = int(user_info[key[1]].replace(',', ''))
         except KeyError:
             pass  # key was never parsed into user_info; nothing to convert
     return user_info
Example #16
 def parse_media(self, user_page, mode='anime'):
     # get div tags
     divs = user_page.select('div.updates.{}'.format(mode))[0].select('div')
     media_list = {}
     for div in divs:
         # parse the media
         try:
             media_link_tag = div.select('a')[0]
         except IndexError:
             continue
         media_link = media_link_tag.get('href')
         media_id = int(media_link.split('/{}/'.format(mode))[1].split('/')[0])
         media_title = media_link_tag.text
         media = getattr(self.session, mode)(media_id).set({'title': media_title})
         # stats
         status_tag = div.select('div')[-1]
         status = status_tag.text
         # update_date
         date_tag = div.select('div span')[0]
         update_date = utilities.parse_profile_date(date_tag.text)
         media_list[media] = self._parse_update_media_status(status)  # first media list dict
         media_list[media]['time'] = update_date  # add more key and item
     return media_list
Example #17
    def parse_sidebar(self, anime_page):
        """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """

        try:
            anime_info = super(Anime, self).parse_sidebar(anime_page)
        except media.InvalidMediaError as e:
            raise InvalidAnimeError(e.id)
        info_panel_first = anime_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            episode_tag = info_panel_first.find(
                text=u'Episodes:').parent.parent
            utilities.extract_tags(
                episode_tag.find_all(u'span', {'class': 'dark_text'}))
            episode_text = episode_tag.text.strip()
            anime_info[u'episodes'] = int(episode_text) if episode_text != 'Unknown' else 0
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
            utilities.extract_tags(
                aired_tag.find_all(u'span', {'class': 'dark_text'}))
            aired_parts = aired_tag.text.strip().split(u' to ')
            if len(aired_parts) == 1:
                # aired on a single date.
                try:
                    aired_date = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse single air date")
                anime_info[u'aired'] = (aired_date, )
            else:
                # two airing dates.
                try:
                    air_start = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse first of two air dates")
                try:
                    air_end = utilities.parse_profile_date(
                        aired_parts[1],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[1],
                        message="Could not parse second of two air dates")
                anime_info[u'aired'] = (air_start, air_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            producers_tag = info_panel_first.find(
                text=u'Producers:').parent.parent
            utilities.extract_tags(
                producers_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'producers'] = []
            for producer_link in producers_tag.find_all('a'):
                if producer_link.text == u'add some':
                    # MAL is saying "None found, add some".
                    break
                link_parts = producer_link.get('href').split('/')
                # of the form: /anime/producer/23/Bandai_Visual
                anime_info[u'producers'].append(
                    self.session.producer(int(link_parts[3])).set(
                        {'name': producer_link.text}))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            duration_tag = info_panel_first.find(
                text=u'Duration:').parent.parent
            utilities.extract_tags(
                duration_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'duration'] = duration_tag.text.strip()
            duration_parts = [
                part.strip() for part in anime_info[u'duration'].split(u'.')
            ]
            duration_mins = 0
            for part in duration_parts:
                part_match = re.match(u'(?P<num>[0-9]+)', part)
                if not part_match:
                    continue
                part_volume = int(part_match.group(u'num'))
                if part.endswith(u'hr'):
                    duration_mins += part_volume * 60
                elif part.endswith(u'min'):
                    duration_mins += part_volume
            anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
            utilities.extract_tags(
                rating_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'rating'] = rating_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return anime_info
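
Note: the duration arithmetic in Example #17 splits on '.' and pattern-matches each piece. The same logic, extracted into a standalone function (the sample strings are assumptions about MAL's duration format):

    import datetime
    import re

    def duration_to_timedelta(text):
        """Convert a MAL-style duration such as '1 hr. 24 min.' to a timedelta."""
        minutes = 0
        for part in (p.strip() for p in text.split('.')):
            match = re.match(r'(?P<num>[0-9]+)', part)
            if not match:
                continue
            value = int(match.group('num'))
            if part.endswith('hr'):
                minutes += value * 60
            elif part.endswith('min'):
                minutes += value
        return datetime.timedelta(minutes=minutes)

    print(duration_to_timedelta('1 hr. 24 min.'))    # 1:24:00
    print(duration_to_timedelta('24 min. per ep.'))  # 0:24:00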
Example #18
    def parse_reviews(self, reviews_page):
        """Parse the DOM and returns user reviews attributes.

        :type reviews_page: :class:`bs4.BeautifulSoup`
        :param reviews_page: MAL user reviews page's DOM

        :rtype: dict
        :return: User reviews attributes.

        """
        user_info = self.parse_sidebar(reviews_page)
        second_col = \
            reviews_page.find(u'div', {u'id': u'content'}).find(u'table').find(u'tr') \
            .find_all(u'td', recursive=False)[1]
        try:
            user_info[u'reviews'] = {}
            reviews = second_col.find_all(u'div', {u'class': u'borderDark'},
                                          recursive=False)
            if reviews:
                for row in reviews:
                    review_info = {}
                    meta_elt, review_elt = row.find_all(u'div', recursive=False)[0:2]
                    meta_rows = meta_elt.find_all(u'div', recursive=False)
                    review_info[u'date'] = utilities.parse_profile_date(
                        meta_rows[0].find(u'div').text)
                    media_link = meta_rows[0].find(u'a')
                    link_parts = media_link.get(u'href').split(u'/')
                    # of the form /(anime|manga)/9760/Hoshi_wo_Ou_Kodomo
                    media_id = link_parts[2]
                    media = getattr(self.session, link_parts[1])(
                        int(media_id)).set({u'title': media_link.text})

                    helpfuls = meta_rows[1].find(u'span', recursive=False)
                    try:
                        regex_str = r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)'
                        helpful_match = re.match(regex_str,
                                                 helpfuls.text).groupdict()
                        review_info[u'people_helped'] = int(
                            helpful_match[u'people_helped'])
                        review_info[u'people_total'] = int(
                            helpful_match[u'people_total'])
                    except AttributeError:
                        # newer markup no longer shows the total; fall back to
                        # parsing e.g. '805 people found this review helpful'
                        helpful_match = helpfuls.text.split(
                            'people found this review helpful')[0]
                        review_info[u'people_helped'] = int(helpful_match)
                        review_info[u'people_total'] = None

                    try:
                        regex_str = r'(?P<media_consumed>[0-9]+) of (?P<media_total>[0-9?]+)'
                        consumption_match = re.match(
                            regex_str, meta_rows[2].text).groupdict()
                        review_info[u'media_consumed'] = int(
                            consumption_match[u'media_consumed'])
                        if consumption_match[u'media_total'] == u'?':
                            review_info[u'media_total'] = None
                        else:
                            review_info[u'media_total'] = int(
                                consumption_match[u'media_total'])
                    except AttributeError:
                        # known formats include:
                        #   anime: '25 of 25 episodes seen', '25 of ? episodes seen', '? episodes'
                        #   manga: '40 chapters', '60 of ? chapters read', '? chapters'
                        # e.g. <div class="lightLink" style="float: right;">24 of 24 episodes seen</div>

                        media_tag = meta_rows[1].find_all('div')[0]
                        if ' episodes' in media_tag.text:
                            user_media_consumption = media_tag.text.split(
                                ' episodes')[0].strip()
                        elif ' chapters' in media_tag.text:
                            user_media_consumption = media_tag.text.split(
                                ' chapters')[0].strip()
                        else:
                            # no format recognized
                            raise AttributeError
                        # user_media_consumption : 'xx of xx', 'xx of ?', '? of xx', or '?'
                        if 'of' not in user_media_consumption:
                            review_info[u'media_consumed'] = None
                            review_info[u'media_total'] = None
                        else:
                            # consumed half of 'xx of yy'
                            temp_consumed = user_media_consumption.split(
                                'of')[0].strip()
                            # total half of 'xx of yy'
                            temp_total = user_media_consumption.split(
                                'of')[1].strip()
                            if temp_consumed == '?':
                                review_info[u'media_consumed'] = None
                            else:
                                review_info[u'media_consumed'] = int(
                                    temp_consumed)
                            if temp_total == '?':
                                review_info[u'media_total'] = None
                            else:
                                review_info[u'media_total'] = int(temp_total)

                    review_info[u'rating'] = int(meta_rows[2].text.replace(
                        u'Overall Rating: ', '').split('Other review')[0])

                    for x in review_elt.find_all([u'div', 'a']):
                        x.extract()

                    try:
                        review_info[u'text'] = review_elt.text.strip()
                    except AttributeError:
                        # review_elt sometimes raises an AttributeError here;
                        # one workaround is to reparse the tag
                        review_info[u'text'] = BeautifulSoup(
                            str(review_elt), "lxml").text.strip()

                    user_info[u'reviews'][media] = review_info
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
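
Note: the AttributeError fallback in Example #18 handles newer markup where only the helped count appears. The string split it relies on can be checked directly (the sample text mirrors the comment in the code):

    text = '805 people found this review helpful'
    people_helped = int(text.split('people found this review helpful')[0])
    print(people_helped)  # 805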
Example #19
  def parse(self, user_page):
    """Parses the DOM and returns user attributes in the main-content area.

    :type user_page: :class:`bs4.BeautifulSoup`
    :param user_page: MAL user page's DOM

    :rtype: dict
    :return: User attributes.

    """
    user_info = self.parse_sidebar(user_page)

    section_headings = user_page.find_all(u'div', {u'class': u'normal_header'})

    # parse general details.
    # we have to work from the bottom up, since there's broken HTML after every header.
    last_online_elt = user_page.find(u'td', text=u'Last Online')
    if last_online_elt:
      general_table = None  # default so a suppressed error can't leave it unbound
      try:
        general_table = last_online_elt.parent.parent
      except:
        if not self.session.suppress_parse_exceptions:
          raise

      if general_table and general_table.name == u'table':
        try:
          last_online_elt = general_table.find(u'td', text=u'Last Online')
          if last_online_elt:
            user_info[u'last_online'] = utilities.parse_profile_date(last_online_elt.findNext(u'td').text)
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          gender = general_table.find(u'td', text=u'Gender')
          if gender:
            user_info[u'gender'] = gender.findNext(u'td').text
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          birthday = general_table.find(u'td', text=u'Birthday')
          if birthday:
            user_info[u'birthday'] = utilities.parse_profile_date(birthday.findNext(u'td').text)
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          location = general_table.find(u'td', text=u'Location')
          if location:
            user_info[u'location'] = location.findNext(u'td').text
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          website = general_table.find(u'td', text=u'Website')
          if website:
            user_info[u'website'] = website.findNext(u'td').text
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          join_date = general_table.find(u'td', text=u'Join Date')
          if join_date:
            user_info[u'join_date'] = utilities.parse_profile_date(join_date.findNext(u'td').text)
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          access_rank = general_table.find(u'td', text=u'Access Rank')
          if access_rank:
            user_info[u'access_rank'] = access_rank.findNext(u'td').text
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          anime_list_views = general_table.find(u'td', text=u'Anime List Views')
          if anime_list_views:
            user_info[u'anime_list_views'] = int(anime_list_views.findNext(u'td').text.replace(',', ''))
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          manga_list_views = general_table.find(u'td', text=u'Manga List Views')
          if manga_list_views:
            user_info[u'manga_list_views'] = int(manga_list_views.findNext(u'td').text.replace(',', ''))
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          num_comments = general_table.find(u'td', text=u'Comments')
          if num_comments:
            user_info[u'num_comments'] = int(num_comments.findNext(u'td').text.replace(',', ''))
        except:
          if not self.session.suppress_parse_exceptions:
            raise

        try:
          num_forum_posts = general_table.find(u'td', text=u'Forum Posts')
          if num_forum_posts:
            user_info[u'num_forum_posts'] = int(num_forum_posts.findNext(u'td').text.replace(" (Find All)", "").replace(',', ''))
        except:
          if not self.session.suppress_parse_exceptions:
            raise

    try:
      # last list updates.
      # filter() is lazy on Python 3, so build a list before subscripting.
      list_updates_header = [x for x in section_headings if u'Last List Updates' in x.text]
      if list_updates_header:
        list_updates_header = list_updates_header[0]
        list_updates_table = list_updates_header.findNext(u'table')
        if list_updates_table:
          user_info[u'last_list_updates'] = {}
          for row in list_updates_table.find_all(u'tr'):
            cols = row.find_all(u'td')
            info_col = cols[1]
            media_link = info_col.find(u'a')
            link_parts = media_link.get(u'href').split(u'/')
            # of the form /(anime|manga)/10087/Fate/Zero
            if link_parts[1] == u'anime':
              media = self.session.anime(int(link_parts[2])).set({u'title': media_link.text})
            else:
              media = self.session.manga(int(link_parts[2])).set({u'title': media_link.text})
            list_update = {}
            progress_div = info_col.find(u'div', {u'class': u'spaceit_pad'})
            if progress_div:
              progress_match = re.match(r'(?P<status>[A-Za-z]+)(  at (?P<episodes>[0-9]+) of (?P<total_episodes>[0-9]+))?', progress_div.text).groupdict()
              list_update[u'status'] = progress_match[u'status']
              if progress_match[u'episodes'] is None:
                list_update[u'episodes'] = None
              else:
                list_update[u'episodes'] = int(progress_match[u'episodes'])
              if progress_match[u'total_episodes'] is None:
                list_update[u'total_episodes'] = None
              else:
                list_update[u'total_episodes'] = int(progress_match[u'total_episodes'])
            time_div = info_col.find(u'div', {u'class': u'lightLink'})
            if time_div:
              list_update[u'time'] = utilities.parse_profile_date(time_div.text)
            user_info[u'last_list_updates'][media] = list_update
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    lower_section_headings = user_page.find_all(u'h2')
    # anime stats.
    try:
      anime_stats_header = [x for x in lower_section_headings if u'Anime Stats' in x.text]
      if anime_stats_header:
        anime_stats_header = anime_stats_header[0]
        anime_stats_table = anime_stats_header.findNext(u'table')
        if anime_stats_table:
          user_info[u'anime_stats'] = {}
          for row in anime_stats_table.find_all(u'tr'):
            cols = row.find_all(u'td')
            value = cols[1].text
            if cols[1].find(u'span', {u'title': u'Days'}):
              value = round(float(value), 1)
            else:
              value = int(value)
            user_info[u'anime_stats'][cols[0].text] = value
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # manga stats.
      manga_stats_header = [x for x in lower_section_headings if u'Manga Stats' in x.text]
      if manga_stats_header:
        manga_stats_header = manga_stats_header[0]
        manga_stats_table = manga_stats_header.findNext(u'table')
        if manga_stats_table:
          user_info[u'manga_stats'] = {}
          for row in manga_stats_table.find_all(u'tr'):
            cols = row.find_all(u'td')
            value = cols[1].text
            if cols[1].find(u'span', {u'title': u'Days'}):
              value = round(float(value), 1)
            else:
              value = int(value)
            user_info[u'manga_stats'][cols[0].text] = value
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      about_header = [x for x in section_headings if u'About' in x.text]
      if about_header:
        about_header = about_header[0]
        user_info[u'about'] = about_header.findNext(u'div').text.strip()
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    return user_info
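
Note: a Python 2/3 portability pitfall lurks in code like Example #19: on Python 2, filter() returns a list, so filter(...)[0] works; on Python 3 it returns a lazy iterator that cannot be subscripted. A list comprehension sidesteps the difference:

    headings = ['About Me', 'Last List Updates', 'Specialty']

    # Python 2: filter(...)[0] worked; Python 3: TypeError on subscripting.
    matches = [h for h in headings if 'Last List Updates' in h]
    if matches:
        print(matches[0])  # Last List Updates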
Example #20
    def parse_sidebar(self, manga_page, manga_page_original=None):
        """Parses the DOM and returns manga attributes in the sidebar.

        :type manga_page: :class:`bs4.BeautifulSoup`
        :param manga_page: MAL manga page's DOM

        :type manga_page_original: :class:`bs4.BeautifulSoup`
        :param manga_page_original: the original MAL manga page's DOM

        :rtype: dict
        :return: manga attributes

        :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidMangaError.
        error_tag = manga_page.find('div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMangaError(self.id)

        try:
            title_tag = manga_page.find('span', {'itemprop': 'name'})
            if not title_tag:
                # otherwise, raise a MalformedMangaPageError.
                raise MalformedMangaPageError(self.id, manga_page, message="Could not find title")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # otherwise, begin parsing.
        manga_info = super(Manga, self).parse_sidebar(manga_page, manga_page_original)

        info_panel_first = manga_page.find('div', {'id': 'content'}).find('table').find('td')

        try:
            volumes_tag = [
                x for x in manga_page.select('span.dark_text') if 'Volumes:' in x.text][0]
            manga_volume = volumes_tag.parent.text.split(':')[1].strip().replace(',', '')
            manga_info[u'volumes'] = (
                int(manga_volume)
                if manga_volume != 'Unknown'
                else None
            )
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            chapters_tag = info_panel_first.find(text=u'Chapters:').parent.parent
            utilities.extract_tags(chapters_tag.find_all(u'span', {'class': 'dark_text'}))
            chapters_tag_text = chapters_tag.text
            if ':' in chapters_tag_text:
                chapters_tag_text = chapters_tag_text.split(':')[1]
            manga_chapters = chapters_tag_text.strip()
            manga_info[u'chapters'] = (
                int(manga_chapters)
                if manga_chapters != 'Unknown'
                else None
            )
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            published_tag = info_panel_first.find(text=u'Published:').parent.parent
            utilities.extract_tags(published_tag.find_all(u'span', {'class': 'dark_text'}))
            published_parts = published_tag.text.strip().split(u' to ')
            # check whether the published field has only a start date or also an end date.
            if len(published_parts) == 1:
                # published on a single date.
                try:
                    published_date = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(self.id, published_parts[0],
                                                  message="Could not parse single publish date")
                publish_start = published_date
                publish_end = None
            else:
                # two publishing dates.
                try:
                    # the start part may contain a redundant prefix,
                    # e.g. 'Published: Feb  24, 2003'
                    if 'Published:' in published_parts[0]:
                        published_parts[0] = published_parts[0].split('Published:')[1].strip()
                    publish_start = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[0],
                        message="Could not parse first of two publish dates"
                    )
                if published_parts[1] == u'?':
                    # this is still publishing.
                    publish_end = None
                else:
                    try:
                        publish_end = utilities.parse_profile_date(published_parts[1])
                    except ValueError:
                        raise MalformedMangaPageError(
                            self.id,
                            published_parts[1],
                            message="Could not parse second of two publish dates"
                        )

            manga_info[u'published'] = (publish_start, publish_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            authors_tag = info_panel_first.find(text='Authors:').parent.parent
            utilities.extract_tags(authors_tag.find_all('span', {'class': 'dark_text'}))
            manga_info['authors'] = {}
            for author_link in authors_tag.find_all('a'):
                link_parts = author_link.get('href').split('/')
                # of the form /people/1867/Naoki_Urasawa
                person = self.session.person(int(link_parts[2])).set({'name': author_link.text})
                role = author_link.nextSibling.replace(' (', '').replace(')', '')
                manga_info['authors'][person] = role
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            serialization_tag = info_panel_first.find(text=u'Serialization:').parent.parent
            publication_link = serialization_tag.find('a')
            manga_info[u'serialization'] = None
            if publication_link:
                # newer pages link as /manga/magazine/450/Bessatsu_Shounen_Magazine;
                # older ones used /manga.php?mid=1, hence the 'mid=' split
                link_parts = publication_link.get('href').split('mid=')
                # link_parts is then e.g. ['/manga/magazine/450/Bessatsu_Shounen_Magazine']
                publication_id = link_parts[0].split('/manga/magazine/')[1].split('/')[0]
                manga_info[u'serialization'] = self.session.publication(int(publication_id)).set(
                    {'name': publication_link.text}
                )
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return manga_info
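
Note: in Example #20, published_parts[1] == u'?' marks a series that is still publishing. The split-and-compare logic can be sanity-checked in isolation:

    def split_published(text):
        """Split 'start to end', mapping an unknown end ('?') to None."""
        parts = [p.strip() for p in text.split(' to ')]
        if len(parts) == 1:
            return parts[0], None
        start, end = parts
        return start, (None if end == '?' else end)

    print(split_published('Feb 24, 2003 to ?'))            # ('Feb 24, 2003', None)
    print(split_published('Feb 24, 2003 to Aug 1, 2006'))  # ('Feb 24, 2003', 'Aug 1, 2006')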
Example #21
    def parse_sidebar(self, manga_page):
        """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
        # if MAL says the series doesn't exist, raise an InvalidMangaError.
        error_tag = manga_page.find(u"div", {"class": "badresult"})
        if error_tag:
            raise InvalidMangaError(self.id)

        try:
            title_tag = manga_page.find(u"div", {"id": "contentWrapper"}).find(u"h1")
            if not title_tag.find(u"div"):
                # otherwise, raise a MalformedMangaPageError.
                raise MalformedMangaPageError(self.id, manga_page, message="Could not find title div")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # otherwise, begin parsing.
        manga_info = super(Manga, self).parse_sidebar(manga_page)

        info_panel_first = manga_page.find(u"div", {"id": "content"}).find(u"table").find(u"td")

        try:
            volumes_tag = info_panel_first.find(text=u"Volumes:").parent.parent
            utilities.extract_tags(volumes_tag.find_all(u"span", {"class": "dark_text"}))
            manga_info[u"volumes"] = int(volumes_tag.text.strip()) if volumes_tag.text.strip() != "Unknown" else None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            chapters_tag = info_panel_first.find(text=u"Chapters:").parent.parent
            utilities.extract_tags(chapters_tag.find_all(u"span", {"class": "dark_text"}))
            manga_info[u"chapters"] = int(chapters_tag.text.strip()) if chapters_tag.text.strip() != "Unknown" else None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            published_tag = info_panel_first.find(text=u"Published:").parent.parent
            utilities.extract_tags(published_tag.find_all(u"span", {"class": "dark_text"}))
            published_parts = published_tag.text.strip().split(u" to ")
            if len(published_parts) == 1:
                # this published once.
                try:
                    published_date = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[0], message="Could not parse single publish date"
                    )
                manga_info[u"published"] = (published_date,)
            else:
                # two publishing dates.
                try:
                    publish_start = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[0], message="Could not parse first of two publish dates"
                    )
                if published_parts == u"?":
                    # this is still publishing.
                    publish_end = None
                else:
                    try:
                        publish_end = utilities.parse_profile_date(published_parts[1])
                    except ValueError:
                        raise MalformedMangaPageError(
                            self.id, published_parts[1], message="Could not parse second of two publish dates"
                        )
                manga_info[u"published"] = (publish_start, publish_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            authors_tag = info_panel_first.find(text=u"Authors:").parent.parent
            utilities.extract_tags(authors_tag.find_all(u"span", {"class": "dark_text"}))
            manga_info[u"authors"] = {}
            for author_link in authors_tag.find_all("a"):
                link_parts = author_link.get("href").split("/")
                # of the form /people/1867/Naoki_Urasawa
                person = self.session.person(int(link_parts[2])).set({"name": author_link.text})
                role = author_link.nextSibling.replace(" (", "").replace(")", "")
                manga_info[u"authors"][person] = role
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            serialization_tag = info_panel_first.find(text=u"Serialization:").parent.parent
            publication_link = serialization_tag.find("a")
            manga_info[u"serialization"] = None
            if publication_link:
                link_parts = publication_link.get("href").split("mid=")
                # of the form /manga.php?mid=1
                manga_info[u"serialization"] = self.session.publication(int(link_parts[1])).set(
                    {"name": publication_link.text}
                )
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return manga_info
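The find(text=...).parent.parent idiom used throughout these sidebars is clearer against a concrete fragment. A minimal sketch follows, using hand-written markup in the shape of the old MAL sidebar (illustrative only, not captured from a live page):

from bs4 import BeautifulSoup

html = '''
<td>
  <div><span class="dark_text">Volumes:</span> Unknown</div>
  <div><span class="dark_text">Chapters:</span> 162</div>
</td>
'''
soup = BeautifulSoup(html, 'html.parser')

def sidebar_int(panel, label):
    # find the label text, climb to the enclosing div, strip the label span.
    tag = panel.find(text=label).parent.parent
    tag.find('span', {'class': 'dark_text'}).extract()
    value = tag.text.strip()
    return int(value) if value != 'Unknown' else None

print(sidebar_int(soup, 'Volumes:'))   # None
print(sidebar_int(soup, 'Chapters:'))  # 162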
Example #22
0
  def parse_sidebar(self, anime_page):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    error_tag = anime_page.find(u'div', {'class': 'badresult'})
    if error_tag:
      raise InvalidAnimeError(self.id)

    title_tag = anime_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
    if not title_tag.find(u'span'):
      # otherwise, raise a MalformedAnimePageError.
      raise MalformedAnimePageError(self.id, anime_page, message="Could not find title span")

    anime_info = super(Anime, self).parse_sidebar(anime_page)
    info_panel_first = anime_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

    try:
      episode_tag = info_panel_first.find(text=u'Episodes:').parent.parent
      utilities.extract_tags(episode_tag.find_all(u'span', {'class': 'dark_text'}))
      anime_info[u'episodes'] = int(episode_tag.text.strip()) if episode_tag.text.strip() != 'Unknown' else 0
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
      utilities.extract_tags(aired_tag.find_all(u'span', {'class': 'dark_text'}))
      aired_parts = aired_tag.text.strip().split(u' to ')
      if len(aired_parts) == 1:
        # this aired once.
        try:
          aired_date = utilities.parse_profile_date(aired_parts[0], suppress=self.session.suppress_parse_exceptions)
        except ValueError:
          raise MalformedAnimePageError(self.id, aired_parts[0], message="Could not parse single air date")
        anime_info[u'aired'] = (aired_date,)
      else:
        # two airing dates.
        try:
          air_start = utilities.parse_profile_date(aired_parts[0], suppress=self.session.suppress_parse_exceptions)
        except ValueError:
          raise MalformedAnimePageError(self.id, aired_parts[0], message="Could not parse first of two air dates")
        try:
          air_end = utilities.parse_profile_date(aired_parts[1], suppress=self.session.suppress_parse_exceptions)
        except ValueError:
          raise MalformedAnimePageError(self.id, aired_parts[1], message="Could not parse second of two air dates")
        anime_info[u'aired'] = (air_start, air_end)
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      producers_tag = info_panel_first.find(text=u'Producers:').parent.parent
      utilities.extract_tags(producers_tag.find_all(u'span', {'class': 'dark_text'}))
      utilities.extract_tags(producers_tag.find_all(u'sup'))
      anime_info[u'producers'] = []
      for producer_link in producers_tag.find_all('a'):
        if producer_link.text == u'add some':
          # MAL is saying "None found, add some".
          break
        link_parts = producer_link.get('href').split('p=')
        # of the form: /anime.php?p=14
        if len(link_parts) > 1:
          anime_info[u'producers'].append(self.session.producer(int(link_parts[1])).set({'name': producer_link.text}))
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      duration_tag = info_panel_first.find(text=u'Duration:').parent.parent
      utilities.extract_tags(duration_tag.find_all(u'span', {'class': 'dark_text'}))
      anime_info[u'duration'] = duration_tag.text.strip()
      duration_parts = [part.strip() for part in anime_info[u'duration'].split(u'.')]
      duration_mins = 0
      for part in duration_parts:
        part_match = re.match(u'(?P<num>[0-9]+)', part)
        if not part_match:
          continue
        part_volume = int(part_match.group(u'num'))
        if part.endswith(u'hr'):
          duration_mins += part_volume * 60
        elif part.endswith(u'min'):
          duration_mins += part_volume
      anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
      utilities.extract_tags(rating_tag.find_all(u'span', {'class': 'dark_text'}))
      anime_info[u'rating'] = rating_tag.text.strip()
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    return anime_info
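The duration arithmetic above (turning '2 hr. 10 min.' into minutes) can be exercised standalone. This sketch applies the same regex-per-part approach:

import datetime
import re

def parse_duration(text):
    # e.g. "2 hr. 10 min." or "24 min. per ep." -> timedelta.
    minutes = 0
    for part in (p.strip() for p in text.split('.')):
        match = re.match(r'(?P<num>[0-9]+)', part)
        if not match:
            continue
        num = int(match.group('num'))
        if part.endswith('hr'):
            minutes += num * 60
        elif part.endswith('min'):
            minutes += num
    return datetime.timedelta(minutes=minutes)

assert parse_duration('2 hr. 10 min.') == datetime.timedelta(minutes=130)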
Example #23
0
    def parse(self, user_page):
        """Parses the DOM and returns user attributes in the main-content area.

    :type user_page: :class:`bs4.BeautifulSoup`
    :param user_page: MAL user page's DOM

    :rtype: dict
    :return: User attributes.

    """
        user_info = self.parse_sidebar(user_page)

        section_headings = user_page.find_all(u'div',
                                              {u'class': u'normal_header'})

        # parse general details.
        # we have to work from the bottom up, since there's broken HTML after every header.
        last_online_elt = user_page.find(u'td', text=u'Last Online')
        if last_online_elt:
            general_table = None
            try:
                general_table = last_online_elt.parent.parent
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

            if general_table and general_table.name == u'table':
                try:
                    last_online_elt = general_table.find(u'td',
                                                         text=u'Last Online')
                    if last_online_elt:
                        user_info[
                            u'last_online'] = utilities.parse_profile_date(
                                last_online_elt.findNext(u'td').text)
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    gender = general_table.find(u'td', text=u'Gender')
                    if gender:
                        user_info[u'gender'] = gender.findNext(u'td').text
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    birthday = general_table.find(u'td', text=u'Birthday')
                    if birthday:
                        user_info[u'birthday'] = utilities.parse_profile_date(
                            birthday.findNext(u'td').text)
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    location = general_table.find(u'td', text=u'Location')
                    if location:
                        user_info[u'location'] = location.findNext(u'td').text
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    website = general_table.find(u'td', text=u'Website')
                    if website:
                        user_info[u'website'] = website.findNext(u'td').text
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    join_date = general_table.find(u'td', text=u'Join Date')
                    if join_date:
                        user_info[u'join_date'] = utilities.parse_profile_date(
                            join_date.findNext(u'td').text)
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    access_rank = general_table.find(u'td',
                                                     text=u'Access Rank')
                    if access_rank:
                        user_info[u'access_rank'] = access_rank.findNext(
                            u'td').text
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    anime_list_views = general_table.find(
                        u'td', text=u'Anime List Views')
                    if anime_list_views:
                        user_info[u'anime_list_views'] = int(
                            anime_list_views.findNext(u'td').text.replace(
                                ',', ''))
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    manga_list_views = general_table.find(
                        u'td', text=u'Manga List Views')
                    if manga_list_views:
                        user_info[u'manga_list_views'] = int(
                            manga_list_views.findNext(u'td').text.replace(
                                ',', ''))
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    num_comments = general_table.find(u'td', text=u'Comments')
                    if num_comments:
                        user_info[u'num_comments'] = int(
                            num_comments.findNext(u'td').text.replace(',', ''))
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

                try:
                    num_forum_posts = general_table.find(u'td',
                                                         text=u'Forum Posts')
                    if num_forum_posts:
                        user_info[u'num_forum_posts'] = int(
                            num_forum_posts.findNext(u'td').text.replace(
                                " (Find All)", "").replace(',', ''))
                except:
                    if not self.session.suppress_parse_exceptions:
                        raise

        try:
            # last list updates.
            list_updates_header = list(filter(
                lambda x: u'Last List Updates' in x.text, section_headings))
            if list_updates_header:
                list_updates_header = list_updates_header[0]
                list_updates_table = list_updates_header.findNext(u'table')
                if list_updates_table:
                    user_info[u'last_list_updates'] = {}
                    for row in list_updates_table.find_all(u'tr'):
                        cols = row.find_all(u'td')
                        info_col = cols[1]
                        media_link = info_col.find(u'a')
                        link_parts = media_link.get(u'href').split(u'/')
                        # of the form /(anime|manga)/10087/Fate/Zero
                        if link_parts[1] == u'anime':
                            media = self.session.anime(int(link_parts[2])).set(
                                {u'title': media_link.text})
                        else:
                            media = self.session.manga(int(link_parts[2])).set(
                                {u'title': media_link.text})
                        list_update = {}
                        progress_div = info_col.find(
                            u'div', {u'class': u'spaceit_pad'})
                        if progress_div:
                            progress_match = re.match(
                                r'(?P<status>[A-Za-z]+)(  at (?P<episodes>[0-9]+) of (?P<total_episodes>[0-9]+))?',
                                progress_div.text).groupdict()
                            list_update[u'status'] = progress_match[u'status']
                            if progress_match[u'episodes'] is None:
                                list_update[u'episodes'] = None
                            else:
                                list_update[u'episodes'] = int(
                                    progress_match[u'episodes'])
                            if progress_match[u'total_episodes'] is None:
                                list_update[u'total_episodes'] = None
                            else:
                                list_update[u'total_episodes'] = int(
                                    progress_match[u'total_episodes'])
                        time_div = info_col.find(u'div',
                                                 {u'class': u'lightLink'})
                        if time_div:
                            list_update[
                                u'time'] = utilities.parse_profile_date(
                                    time_div.text)
                        user_info[u'last_list_updates'][media] = list_update
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        lower_section_headings = user_page.find_all(u'h2')
        # anime stats.
        try:
            anime_stats_header = list(filter(lambda x: u'Anime Stats' in x.text,
                                             lower_section_headings))
            if anime_stats_header:
                anime_stats_header = anime_stats_header[0]
                anime_stats_table = anime_stats_header.findNext(u'table')
                if anime_stats_table:
                    user_info[u'anime_stats'] = {}
                    for row in anime_stats_table.find_all(u'tr'):
                        cols = row.find_all(u'td')
                        value = cols[1].text
                        if cols[1].find(u'span', {u'title': u'Days'}):
                            value = round(float(value), 1)
                        else:
                            value = int(value)
                        user_info[u'anime_stats'][cols[0].text] = value
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # manga stats.
            manga_stats_header = list(filter(lambda x: u'Manga Stats' in x.text,
                                             lower_section_headings))
            if manga_stats_header:
                manga_stats_header = manga_stats_header[0]
                manga_stats_table = manga_stats_header.findNext(u'table')
                if manga_stats_table:
                    user_info[u'manga_stats'] = {}
                    for row in manga_stats_table.find_all(u'tr'):
                        cols = row.find_all(u'td')
                        value = cols[1].text
                        if cols[1].find(u'span', {u'title': u'Days'}):
                            value = round(float(value), 1)
                        else:
                            value = int(value)
                        user_info[u'manga_stats'][cols[0].text] = value
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            about_header = list(filter(lambda x: u'About' in x.text,
                                       section_headings))
            if about_header:
                about_header = about_header[0]
                user_info[u'about'] = about_header.findNext(
                    u'div').text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
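The general-details block walks a label/value table with findNext(u'td'). Here is a small sketch of that traversal on a hand-written fragment (again, illustrative markup rather than a captured page):

from bs4 import BeautifulSoup

html = '''
<table>
  <tr><td>Gender</td><td>Male</td></tr>
  <tr><td>Location</td><td>Tokyo</td></tr>
  <tr><td>Comments</td><td>1,234</td></tr>
</table>
'''
table = BeautifulSoup(html, 'html.parser')

def cell_after(label):
    # the value cell immediately follows the label cell in document order.
    label_td = table.find('td', text=label)
    return label_td.findNext('td').text if label_td else None

print(cell_after('Location'))                        # Tokyo
print(int(cell_after('Comments').replace(',', '')))  # 1234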
Example #24
0
    def parse_sidebar(self, anime_page, anime_page_original=None):
        """Parses the DOM and returns anime attributes in the sidebar.

        :type anime_page: :class:`bs4.BeautifulSoup`
        :param anime_page: MAL anime page's DOM

        :type anime_page_original: :class:`bs4.BeautifulSoup`
        :param anime_page_original: MAL anime page's DOM, uncleaned

        :rtype: dict
        :return: anime attributes

        :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidAnimeError.
        error_tag = anime_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidAnimeError(self.id)

        title_tag = anime_page.find(u'div', {
            'id': 'contentWrapper'
        }).find(u'h1')
        if not title_tag.find(u'div'):
            # otherwise, raise a MalformedAnimePageError.
            try:
                title_tag = anime_page.select('h1.h1 span')[0].text
            except IndexError:
                raise MalformedAnimePageError(
                    self.id, None, message="Could not find title div")

        anime_info = super(Anime, self).parse_sidebar(anime_page,
                                                      anime_page_original)
        info_panel_first = anime_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            episode_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Episodes:' in x.text
            ][0].parent
            episodes_text = episode_tag.text.split(':')[-1].strip()
            anime_info[u'episodes'] = (int(episodes_text)
                                       if episodes_text != 'Unknown' else 0)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            aired_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Aired:' in x.text
            ][0].parent
            aired_tag_text = aired_tag.text.split(':')[1]
            aired_parts = aired_tag_text.strip().split(u' to ')
            if len(aired_parts) == 1:
                # this aired once.
                try:
                    aired_date = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse single air date")
                anime_info[u'aired'] = (aired_date, )
            else:
                # two airing dates.
                try:
                    air_start = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse first of two air dates")
                try:
                    air_end = utilities.parse_profile_date(
                        aired_parts[1],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[1],
                        message="Could not parse second of two air dates")
                anime_info[u'aired'] = (air_start, air_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        try:
            anime_info[u'producers'] = self.parse_producers(anime_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            duration_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Duration:' in x.text
            ][0].parent
            anime_info[u'duration'] = duration_tag.text.split(':')[1].strip()
            duration_parts = [
                part.strip() for part in anime_info[u'duration'].split(u'.')
            ]
            duration_mins = 0
            for part in duration_parts:
                part_match = re.match(u'(?P<num>[0-9]+)', part)
                if not part_match:
                    continue
                part_volume = int(part_match.group(u'num'))
                if part.endswith(u'hr'):
                    duration_mins += part_volume * 60
                elif part.endswith(u'min'):
                    duration_mins += part_volume
            anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            rating_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Rating:' in x.text
            ][0].parent
            utilities.extract_tags(
                rating_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'rating'] = rating_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return anime_info
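Newer MAL pages drop the dark_text spans, so this parser scans every <span> for a label substring and reads the parent's text after the colon. A sketch of that lookup; note that splitting with maxsplit=1 avoids truncating values that themselves contain a colon, a small hardening over the split(':')[1] used above:

from bs4 import BeautifulSoup

html = '''
<div><span>Episodes:</span> 26</div>
<div><span>Rating:</span> R - 17+ (violence &amp; profanity)</div>
'''
page = BeautifulSoup(html, 'html.parser')

def span_field(label):
    # find the first span containing the label, then split the parent's
    # full text on the first colon to get the value.
    tag = [x for x in page.find_all('span') if label in x.text][0].parent
    return tag.text.split(':', 1)[1].strip()

print(span_field('Episodes:'))  # 26
print(span_field('Rating:'))    # R - 17+ (violence & profanity)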
Example #25
0
    def parse(self, user_page):
        """Parses the DOM and returns user attributes in the main-content area.

    :type user_page: :class:`bs4.BeautifulSoup`
    :param user_page: MAL user page's DOM

    :rtype: dict
    :return: User attributes.

    """
        user_info = self.parse_sidebar(user_page)

        section_headings = user_page.find_all(u'div',
                                              {u'class': u'normal_header'})

        # parse general details.
        try:
            num_comments_tag = user_page.find(u'a',
                                              text=re.compile(u'All Comments'))
            num_comments = re.search(r'\((\d+)\)',
                                     num_comments_tag.text).group(1)
            user_info[u'num_comments'] = int(num_comments)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # parse favorites
        favorites_tag = user_page.find(u'div', {u'class': u'user-favorites'})
        if favorites_tag:
            favorites_section = favorites_tag.find_all(u'div', recursive=False)

            try:
                favorite_anime_header = favorites_section[0]
                user_info[u'favorite_anime'] = []
                for elt in favorite_anime_header.find_all(u'li'):
                    link_tag = elt.find_all(u'a')[1]
                    link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                        u'/')
                    # of the form /anime/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                    user_info[u'favorite_anime'].append(
                        self.session.anime(int(link_parts[2])).set(
                            {u'title': link_tag.text}))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

            try:
                favorite_manga_header = favorites_section[1]
                user_info[u'favorite_manga'] = []
                for elt in favorite_manga_header.find_all(u'li'):
                    link_tag = elt.find_all(u'a')[1]
                    link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                        u'/')
                    # of the form /manga/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                    user_info[u'favorite_manga'].append(
                        self.session.manga(int(link_parts[2])).set(
                            {u'title': link_tag.text}))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

            try:
                favorite_character_header = favorites_section[2]
                user_info[u'favorite_characters'] = {}
                for elt in favorite_character_header.find_all(u'li'):
                    link_tag = elt.find_all(u'a')[1]
                    link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                        u'/')
                    # of the form /character/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                    char = self.session.character(int(link_parts[2])).set(
                        {u'title': link_tag.text})
                    media_link_tag = link_tag.nextSibling.find(u'a')
                    media_link_parts = media_link_tag.get(u'href').split(u'/')
                    # of the form /anime|manga/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                    anime = getattr(self.session, media_link_parts[1])(int(
                        media_link_parts[2])).set(
                            {u'title': media_link_tag.text})
                    user_info[u'favorite_characters'][char] = anime
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

            try:
                favorite_people_header = favorites_section[3]
                user_info[u'favorite_people'] = []
                for elt in favorite_people_header.find_all(u'li'):
                    link_tag = elt.find_all(u'a')[1]
                    link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                        u'/')
                    # of the form /people/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                    user_info[u'favorite_people'].append(
                        self.session.person(int(link_parts[2])).set(
                            {u'title': link_tag.text}))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

        stats_tag = user_page.find(id='statistics')

        try:
            # last list updates.
            list_updates_header = list(filter(
                lambda x: u'Last List Updates' in x.text, section_headings))
            if list_updates_header:
                list_updates_header = list_updates_header[0]
                list_updates_table = list_updates_header.findNext(u'table')
                if list_updates_table:
                    user_info[u'last_list_updates'] = {}
                    for row in list_updates_table.find_all(u'tr'):
                        cols = row.find_all(u'td')
                        info_col = cols[1]
                        media_link = info_col.find(u'a')
                        link_parts = media_link.get(u'href').split(u'/')
                        # of the form /(anime|manga)/10087/Fate/Zero
                        if link_parts[1] == u'anime':
                            media = self.session.anime(int(link_parts[2])).set(
                                {u'title': media_link.text})
                        else:
                            media = self.session.manga(int(link_parts[2])).set(
                                {u'title': media_link.text})
                        list_update = {}
                        progress_div = info_col.find(
                            u'div', {u'class': u'spaceit_pad'})
                        if progress_div:
                            progress_match = re.match(
                                r'(?P<status>[A-Za-z]+)(  at (?P<episodes>[0-9]+) of (?P<total_episodes>[0-9]+))?',
                                progress_div.text).groupdict()
                            list_update[u'status'] = progress_match[u'status']
                            if progress_match[u'episodes'] is None:
                                list_update[u'episodes'] = None
                            else:
                                list_update[u'episodes'] = int(
                                    progress_match[u'episodes'])
                            if progress_match[u'total_episodes'] is None:
                                list_update[u'total_episodes'] = None
                            else:
                                list_update[u'total_episodes'] = int(
                                    progress_match[u'total_episodes'])
                        time_div = info_col.find(u'div',
                                                 {u'class': u'lightLink'})
                        if time_div:
                            list_update[
                                u'time'] = utilities.parse_profile_date(
                                    time_div.text)
                        user_info[u'last_list_updates'][media] = list_update
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # anime stats.
        try:
            anime_stats_header = stats_tag.find(u'div',
                                                {u'class': u'stats anime'})
            stats = user_info['anime_stats'] = {}
            stats['Days'] = float(
                anime_stats_header.find(
                    text=re.compile('Days')).parent.nextSibling)
            stats['Mean Score'] = float(
                anime_stats_header.find(
                    text=re.compile('Mean Score')).parent.nextSibling)
            stats_tables = anime_stats_header.find_all(u'ul')
            # watching, completed, etc
            for metric in stats_tables[0].find_all(u'li'):
                stats[metric.find(u'a').text] = int(
                    metric.find(u'span').text.replace(',', ''))
            # total entries, rewatched, etc
            for metric in stats_tables[1].find_all(u'li'):
                parts = metric.find_all(u'span')
                stats[parts[0].text] = int(parts[1].text.replace(',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # manga stats.
        try:
            manga_stats_header = stats_tag.find(u'div',
                                                {u'class': u'stats manga'})
            stats = user_info['manga_stats'] = {}
            stats['Days'] = float(
                manga_stats_header.find(
                    text=re.compile('Days')).parent.nextSibling)
            stats['Mean Score'] = float(
                manga_stats_header.find(
                    text=re.compile('Mean Score')).parent.nextSibling)
            stats_tables = manga_stats_header.find_all(u'ul')
            # reading, completed, etc
            for metric in stats_tables[0].find_all(u'li'):
                stats[metric.find(u'a').text] = int(
                    metric.find(u'span').text.replace(',', ''))
            # total entries, reread, etc
            for metric in stats_tables[1].find_all(u'li'):
                parts = metric.find_all(u'span')
                stats[parts[0].text] = int(parts[1].text.replace(',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            about_header = user_page.find(u'div',
                                          {u'class': u'profile-about-user'})
            if not about_header:
                user_info[u'about'] = u''
            else:
                user_info[u'about'] = about_header.find(u'div').text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
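The stats blocks read each number from the text node that follows a label tag (.parent.nextSibling). A minimal sketch of that sibling walk:

import re
from bs4 import BeautifulSoup

html = '''
<div class="stats anime">
  <div><span>Days:</span> 142.3</div>
  <div><span>Mean Score:</span> 7.84</div>
</div>
'''
stats_div = BeautifulSoup(html, 'html.parser')

def stat_value(label):
    # the number lives in the text node right after the label's span.
    return float(stats_div.find(text=re.compile(label)).parent.nextSibling)

print(stat_value('Days'))        # 142.3
print(stat_value('Mean Score'))  # 7.84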
Example #26
0
    def parse_reviews(self, reviews_page):
        """Parse the DOM and returns user reviews attributes.

        :type reviews_page: :class:`bs4.BeautifulSoup`
        :param reviews_page: MAL user reviews page's DOM

        :rtype: dict
        :return: User reviews attributes.

        """
        user_info = self.parse_sidebar(reviews_page)
        second_col = (
            reviews_page
            .find(u'div', {u'id': u'content'})
            .find(u'table')
            .find(u'tr')
            .find_all(u'td', recursive=False)[1]
        )

        try:
            user_info['reviews'] = {}
            reviews = second_col.find_all('div', {'class': 'borderDark'}, recursive=False)
            if reviews:
                for row in reviews:
                    review_info = {}
                    # each review row holds a metadata div followed by the review body div.
                    (meta_elt, review_elt) = row.find_all('div', recursive=False)[0:2]
                    meta_rows = meta_elt.find_all(u'div', recursive=False)
                    date_txt = meta_rows[0].find(u'div').text
                    review_info[u'date'] = utilities.parse_profile_date(date_txt)
                    media_link = meta_rows[0].find(u'a')
                    link_parts = media_link.get(u'href').split(u'/')
                    # of the form /(anime|manga)/9760/Hoshi_wo_Ou_Kodomo
                    media_id = int(link_parts[2])
                    media_type = link_parts[1]
                    media = getattr(
                        self.session,
                        media_type
                    )(media_id).set({u'title': media_link.text})

                    helpfuls = meta_rows[1].find('span', recursive=False)
                    try:
                        hm_reg = r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)'
                        helpful_match = re.match(hm_reg, helpfuls.text).groupdict()
                        review_info[u'people_helped'] = int(helpful_match[u'people_helped'])
                        review_info[u'people_total'] = int(helpful_match[u'people_total'])
                    except AttributeError:
                        # the total is no longer shown on newer pages, so the
                        # regex fails; fall back to splitting on the phrase,
                        # e.g. "805 people found this review helpful".
                        helpful_match = helpfuls.text.split('people found this review helpful')[0]
                        review_info['people_helped'] = int(helpful_match)
                        review_info['people_total'] = None

                    try:
                        cm_reg = r'(?P<media_consumed>[0-9]+) of (?P<media_total>[0-9?]+)'
                        consumption_match = re.match(cm_reg, meta_rows[2].text).groupdict()
                        review_info[u'media_consumed'] = int(consumption_match[u'media_consumed'])
                        if consumption_match[u'media_total'] == u'?':
                            review_info[u'media_total'] = None
                        else:
                            review_info['media_total'] = int(consumption_match['media_total'])
                    except AttributeError:
                        # known formats:
                        #   anime: "25 of 25 episodes seen", "25 of ? episodes seen", "? episodes"
                        #   manga: "40 chapters", "60 of ? chapters read", "? chapters"
                        # e.g. <div class="lightLink" style="float: right;">24 of 24 episodes seen</div>

                        media_tag = meta_rows[1].find_all('div')[0]
                        if ' episodes' in media_tag.text:
                            user_media_consumption = media_tag.text.split(' episodes')[0].strip()
                        elif ' chapters' in media_tag.text:
                            user_media_consumption = media_tag.text.split(' chapters')[0].strip()
                        else:
                            # no format recognized
                            raise AttributeError
                        # user_media_consumption : 'xx of xx', 'xx of ?', '? of xx', or '?'
                        if 'of' not in user_media_consumption:
                            review_info['media_consumed'] = None
                            review_info['media_total'] = None
                        else:
                            # split "xx of yy" into the consumed and total halves.
                            temp_consumed = user_media_consumption.split('of')[0].strip()
                            temp_total = user_media_consumption.split('of')[1].strip()
                            if temp_consumed == '?':
                                review_info['media_consumed'] = None
                            else:
                                review_info['media_consumed'] = int(temp_consumed)
                            if temp_total == '?':
                                review_info['media_total'] = None
                            else:
                                review_info['media_total'] = int(temp_total)

                    rating_txt = meta_rows[2].text.replace(u'Overall Rating: ', '')
                    rating_txt = rating_txt.split('Other review')[0]
                    review_info[u'rating'] = int(rating_txt)

                    for x in review_elt.find_all(['div', 'a']):
                        x.extract()

                    try:
                        review_info[u'text'] = review_elt.text.strip()
                    except AttributeError:
                        # review_elt can occasionally raise an AttributeError;
                        # one workaround is to re-parse the tag from scratch.
                        review_info[u'text'] = BeautifulSoup(str(review_elt), "lxml").text.strip()

                    user_info['reviews'][media] = review_info
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
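The two helpful-count formats handled above fit in one small function. This sketch folds the regex path and the phrase-split fallback together:

import re

def parse_helpfuls(text):
    # old format: "12 of 34 people found this review helpful"
    match = re.match(r'(?P<helped>[0-9]+) of (?P<total>[0-9]+)', text)
    if match:
        return int(match.group('helped')), int(match.group('total'))
    # new format shows only the helped count,
    # e.g. "805 people found this review helpful"
    helped = text.split('people found this review helpful')[0]
    return int(helped), None

assert parse_helpfuls('12 of 34 people found this review helpful') == (12, 34)
assert parse_helpfuls('805 people found this review helpful') == (805, None)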
Example #27
0
    def parse_sidebar(self, anime_page, anime_page_original=None):
        """Parse the DOM and returns anime attributes in the sidebar.

        :type anime_page: :class:`bs4.BeautifulSoup`
        :param anime_page: MAL anime page's DOM

        :type anime_page_original: :class:`bs4.BeautifulSoup`
        :param anime_page_original: MAL anime page's DOM, uncleaned

        :rtype: dict
        :return: anime attributes

        :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidAnimeError.
        error_tag = anime_page.find('div', {'class': 'badresult'})
        if error_tag:
            raise InvalidAnimeError(self.id)

        title_tag = anime_page.find('div', {'id': 'contentWrapper'}).find('h1')
        if not title_tag.find('div'):
            # otherwise, raise a MalformedAnimePageError.
            try:
                title_tag = anime_page.select('h1.h1 span')[0].text
            except IndexError:
                raise MalformedAnimePageError(self.id, None, message="Could not find title div")

        anime_info = super(Anime, self).parse_sidebar(anime_page, anime_page_original)
        # 'info_panel_first' is unused in this version:
        # info_panel_first = anime_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

        try:
            anime_info['episodes'] = self._parse_episodes(anime_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            aired_tag = [x for x in anime_page_original.find_all('span')
                         if 'Aired:' in x.text][0].parent
            aired_tag_text = aired_tag.text.split(':')[1]
            aired_parts = aired_tag_text.strip().split(' to ')
            suppress_parse_exceptions = self.session.suppress_parse_exceptions
            if len(aired_parts) == 1:
                # this aired once.
                try:
                    aired_date = parse_profile_date(aired_parts[0],
                                                    suppress=suppress_parse_exceptions)
                except ValueError:
                    err_msg = "Could not parse single air date"
                    raise MalformedAnimePageError(self.id, aired_parts[0], message=err_msg)
                anime_info['aired'] = (aired_date,)
            else:
                # two airing dates.
                try:
                    air_start = parse_profile_date(aired_parts[0],
                                                   suppress=suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(self.id, aired_parts[0],
                                                  message="Could not parse first of two air dates")
                try:
                    air_end = parse_profile_date(aired_parts[1],
                                                 suppress=suppress_parse_exceptions)
                except ValueError:
                    error_msg = "Could not parse second of two air dates"
                    raise MalformedAnimePageError(self.id, aired_parts[1],
                                                  message=error_msg)
                anime_info['aired'] = (air_start, air_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        try:
            anime_info['producers'] = self.parse_producers(anime_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            duration_tag = [x for x in anime_page_original.find_all('span')
                            if 'Duration:' in x.text][0].parent
            anime_info['duration'] = duration_tag.text.split(':')[1].strip()
            duration_parts = [part.strip() for part in anime_info['duration'].split('.')]
            duration_mins = 0
            for part in duration_parts:
                part_match = re.match('(?P<num>[0-9]+)', part)
                if not part_match:
                    continue
                part_volume = int(part_match.group('num'))
                if part.endswith('hr'):
                    duration_mins += part_volume * 60
                elif part.endswith('min'):
                    duration_mins += part_volume
            anime_info['duration'] = datetime.timedelta(minutes=duration_mins)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            rating_tag = [x for x in anime_page_original.find_all('span')
                          if 'Rating:' in x.text][0].parent
            utilities.extract_tags(rating_tag.find_all('span', {'class': 'dark_text'}))
            anime_info['rating'] = rating_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return anime_info
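Every field in these parsers repeats the same try/except guard keyed off session.suppress_parse_exceptions. One way to collapse that boilerplate is a small context manager; this is a sketch of the idea, not part of the library:

from contextlib import contextmanager

@contextmanager
def parse_step(suppress):
    # swallow parse errors when suppression is on; re-raise otherwise.
    try:
        yield
    except Exception:
        if not suppress:
            raise

# usage: each field parse becomes a two-line block.
info = {}
with parse_step(suppress=True):
    info['episodes'] = int('not a number')  # fails quietly when suppressed
print(info)  # {}

Each field parse then shrinks to a with-block, and the suppression policy lives in one place instead of being restated per field.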
Example #28
0
    def parse_sidebar(self, manga_page, manga_page_original=None):
        """Parses the DOM and returns manga attributes in the sidebar.

        :type manga_page: :class:`bs4.BeautifulSoup`
        :param manga_page: MAL manga page's DOM

        :type manga_page_original: :class:`bs4.BeautifulSoup`
        :param manga_page_original: MAL manga page's DOM, uncleaned

        :rtype: dict
        :return: manga attributes

        :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidMangaError.
        error_tag = manga_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMangaError(self.id)

        try:
            title_tag = manga_page.find(u'span', {'itemprop': 'name'})
            if not title_tag:
                # otherwise, raise a MalformedMangaPageError.
                raise MalformedMangaPageError(self.id,
                                              manga_page,
                                              message="Could not find title")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # otherwise, begin parsing.
        manga_info = super(Manga, self).parse_sidebar(manga_page,
                                                      manga_page_original)

        info_panel_first = manga_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            volumes_tag = info_panel_first.find(text=u'Volumes:').parent.parent
            utilities.extract_tags(
                volumes_tag.find_all(u'span', {'class': 'dark_text'}))
            manga_info[u'volumes'] = (int(volumes_tag.text.strip())
                                      if volumes_tag.text.strip() != 'Unknown'
                                      else None)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            chapters_tag = info_panel_first.find(
                text=u'Chapters:').parent.parent
            utilities.extract_tags(
                chapters_tag.find_all(u'span', {'class': 'dark_text'}))
            manga_info[u'chapters'] = (int(chapters_tag.text.strip())
                                       if chapters_tag.text.strip() != 'Unknown'
                                       else None)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            published_tag = info_panel_first.find(
                text=u'Published:').parent.parent
            utilities.extract_tags(
                published_tag.find_all(u'span', {'class': 'dark_text'}))
            published_parts = published_tag.text.strip().split(u' to ')
            if len(published_parts) == 1:
                # this published once.
                try:
                    published_date = utilities.parse_profile_date(
                        published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[0],
                        message="Could not parse single publish date")
                manga_info[u'published'] = (published_date, )
            else:
                # two publishing dates.
                try:
                    publish_start = utilities.parse_profile_date(
                        published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[0],
                        message="Could not parse first of two publish dates")
                if published_parts[1] == u'?':
                    # this is still publishing.
                    publish_end = None
                else:
                    try:
                        publish_end = utilities.parse_profile_date(
                            published_parts[1])
                    except ValueError:
                        raise MalformedMangaPageError(
                            self.id,
                            published_parts[1],
                            message="Could not parse second of two publish dates")
                manga_info[u'published'] = (publish_start, publish_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            authors_tag = info_panel_first.find(text=u'Authors:').parent.parent
            utilities.extract_tags(
                authors_tag.find_all(u'span', {'class': 'dark_text'}))
            manga_info[u'authors'] = {}
            for author_link in authors_tag.find_all('a'):
                link_parts = author_link.get('href').split('/')
                # of the form /people/1867/Naoki_Urasawa
                person = self.session.person(int(link_parts[2])).set(
                    {'name': author_link.text})
                role = author_link.nextSibling.replace(' (',
                                                       '').replace(')', '')
                manga_info[u'authors'][person] = role
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            manga_info[u'serialization'] = self.parse_serialization(manga_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return manga_info
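Several of the parsers above (list updates, reviews) normalize a 'consumed of total' string where either side may be '?'. A standalone sketch of that normalization:

def parse_progress(text):
    # "24 of 24", "60 of ?", "?" -> (consumed, total), with None for '?'.
    if 'of' not in text:
        return None, None
    consumed, total = (part.strip() for part in text.split('of', 1))
    return (None if consumed == '?' else int(consumed),
            None if total == '?' else int(total))

assert parse_progress('24 of 24') == (24, 24)
assert parse_progress('60 of ?') == (60, None)
assert parse_progress('?') == (None, None)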