def __init__(self, ignore_errors=True):
    '''Initializes the base HTMLPageParser and remembers the error policy.

    ignore_errors is only stored here (as a private attribute of the
    instance); this method itself does not act on it.
    '''

    # The base class is initialized first, exactly as before, so that any
    # state it sets up exists before our own attribute is added.
    HTMLPageParser.__init__(self)

    self.__ignore_errors = ignore_errors
def parse(self, html):
    '''Parses the specified HTML.

    Returns the dictionary that is filled in while HTMLPageParser.parse()
    processes the page. On success it is guaranteed to contain the
    'user_name' and 'posts' keys, and every post has a 'title' (a post
    without one gets the user name as its title and the fact is logged).

    Raises:
        PrivateGroupError - the page says that the group is private.
        ProfileNotAvailableError - the page contains a "profile_deleted"
            block.
        ServerError - the page is a server error page; the exception
            argument is the error text with the markup stripped.
        ParseError - the required data hasn't been found and none of the
            cases above has been recognized.
    '''

    try:
        self.__data = {}
        self.__private_data = {}

        try:
            HTMLPageParser.parse(self, html)
        except _StopParsing:
            # Not an error: the data collected so far is validated below.
            pass

        if 'user_name' not in self.__data:
            raise ParseError('Unable to find the user name.')

        if 'posts' not in self.__data:
            raise ParseError('Unable to find the wall.')

        # An empty post list is valid only if the page has been explicitly
        # recognized as having an empty wall.
        if not self.__data['posts'] and not self.__private_data.get('wall_is_empty'):
            raise ParseError('Unable to find wall posts.')

        for post in self.__data['posts']:
            if 'title' not in post:
                LOG.error('Unable to find a title for post %s.', post['url'])
                post['title'] = self.__data['user_name']

        return self.__data
    except ParseError:
        # Try to understand why we haven't found the wall on the page
        self.__raise_known_page_error(html)

        # Other errors
        raise


def __raise_known_page_error(self, html):
    '''Raises a specific exception if the page explains the parse failure.

    Returns normally when none of the known cases is recognized, so the
    caller can re-raise its original error.
    '''

    # re.UNICODE is required for re.IGNORECASE and \s to work with the
    # non-ASCII (Cyrillic) text on Python 2; on Python 3 it is the default
    # for text patterns.
    #
    # Literals with Cyrillic text are written as u'' (not ur'', which is a
    # syntax error on Python 3), hence the doubled backslashes in them.
    flags = re.IGNORECASE | re.VERBOSE | re.UNICODE

    # Matches a class attribute which contains {name} as one of its
    # (whitespace separated) classes - unquoted or in any quotes.
    class_attr_regex_template = r'''
        \s+class=(?:
            {name}
            |
            '(?:[^']*\s+)?{name}(?:\s+[^']*)?'
            |
            "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?"
        )
    '''

    # It may be a private group: <h1 id="title"> which starts with
    # "Закрытая группа" ("Private group").
    private_group_regex = (
        r'''
            <h1''' + self.tag_attrs_regex + r'''
                \s+id=(?:title|'title'|"title")''' + self.tag_attrs_regex + u'''
            \\s*>
                \\s*Закрытая\\s+группа
        '''
    )

    if re.search(private_group_regex, html, flags):
        raise PrivateGroupError()

    # User's profile may be deleted: <div class="... profile_deleted ...">
    deleted_profile_regex = (
        r'''
            <div''' + self.tag_attrs_regex +
        class_attr_regex_template.format(name='profile_deleted') +
        self.tag_attrs_regex + r'''
            \s*>
        '''
    )

    if re.search(deleted_profile_regex, html, flags):
        raise ProfileNotAvailableError()

    # The server is on maintenance or returned a user friendly error: the
    # page title is "Ошибка" ("Error") and the message is the contents of
    # a <div class="... body ..."> up to the next <div or </div.
    server_error_regex = (
        r'''
            <title''' + self.tag_attrs_regex + u'''\\s*>
                \\s*Ошибка\\s*
            </title>
            .*
            <div''' + self.tag_attrs_regex +
        class_attr_regex_template.format(name='body') +
        self.tag_attrs_regex + r'''
            \s*>
                (.*?)
            </?div
        '''
    )

    match = re.search(server_error_regex, html, flags | re.DOTALL)

    if match:
        # Drop the tags and then any stray angle brackets which are left
        message = re.sub('<[^>]*>', '', match.group(1))
        message = message.replace('<', '').replace('>', '').strip()
        raise ServerError(message)