Example #1
0
 def __init__(self, ignore_errors=True):
     '''Initializes the base parser and remembers the error handling mode.

     ignore_errors is stored as is in a private attribute; how it is used is
     not visible from this method (presumably it makes the parser tolerant
     to non-fatal page errors -- confirm against the handlers that read it).
     '''

     # The base class is initialized explicitly (not via super()).
     HTMLPageParser.__init__(self)
     self.__ignore_errors = ignore_errors
Example #2
0
    def parse(self, html):
        '''Parses the specified HTML.'''

        try:
            self.__data = {}
            self.__private_data = {}

            try:
                HTMLPageParser.parse(self, html)
            except _StopParsing:
                pass


            if 'user_name' not in self.__data:
                raise ParseError('Unable to find the user name.')

            if 'posts' not in self.__data:
                raise ParseError('Unable to find the wall.')

            if not self.__data['posts'] and not self.__private_data.get('wall_is_empty'):
                raise ParseError('Unable to find wall posts.')


            for post in self.__data['posts']:
                if 'title' not in post:
                    LOG.error('Unable to find a title for post %s.', post['url'])
                    post['title'] = self.__data['user_name']


            return self.__data
        except ParseError:
            # Try to understand why we haven't found the wall on the page

            class_attr_regex_template = r'''
                \s+class=(?:
                    {name}
                    |
                    '(?:[^']*\s+)?{name}(?:\s+[^']*)?'
                    |
                    "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?"
                )
            '''

            # It may be a private group
            if re.search(r'''
                <h1''' +
                    self.tag_attrs_regex + r'''
                    \s+id=(?:title|'title'|"title")''' +
                    self.tag_attrs_regex + ur'''
                \s*>
                    \s*Закрытая\s+группа
            ''', html, re.IGNORECASE | re.VERBOSE):
                raise PrivateGroupError()

            # User's profile may be deleted
            if re.search(r'''
                <div''' +
                    self.tag_attrs_regex +
                    class_attr_regex_template.format(name = 'profile_deleted') +
                    self.tag_attrs_regex + r'''
                \s*>
            ''', html, re.IGNORECASE | re.VERBOSE):
                raise ProfileNotAvailableError()



            # The server is on maintenance or returned a user friendly error -->
            match = re.search(r'''
                <title''' + self.tag_attrs_regex + ur'''\s*>
                    \s*Ошибка\s*
                </title>
                .*
                <div''' +
                    self.tag_attrs_regex +
                    class_attr_regex_template.format(name = 'body') +
                    self.tag_attrs_regex + r'''
                \s*>
                    (.*?)
                </?div
            ''', html, re.VERBOSE | re.DOTALL | re.IGNORECASE)

            if match:
                raise ServerError(
                    re.sub('<[^>]*>', '', match.group(1)).replace('<', '').replace('>', '').strip())
            # The server is on maintenance or returned a user friendly error <--


            # Other errors
            raise