def test(self):
    '''Testing HTMLPageParser.

    Feeds every entry of the ``html_parser`` directory (resolved relative to
    the current working directory) to a fresh ``HTMLPageParser``. Nothing is
    asserted about the parse result; any exception raised while reading or
    parsing a page propagates to the caller.
    '''

    page_dir = 'html_parser'

    for page_name in os.listdir(page_dir):
        page_path = os.path.join(page_dir, page_name)

        # The parenthesised single-argument form prints the same text whether
        # ``print`` is a statement or a function.
        print('Testing "{0}"...'.format(page_path))

        # Read the raw bytes so that ``decode`` is applied to a byte string
        # on every Python version, and close the file as soon as it has been
        # read instead of leaving the handle to the garbage collector.
        with open(page_path, 'rb') as page_file:
            html = page_file.read().decode('utf-8')

        HTMLPageParser().parse(html)
def parse(self, html):
    """Parse the specified HTML and return the data collected from it.

    The page is handed to ``HTMLPageParser.parse``; a ``_StopParsing``
    raised from there is swallowed. Afterwards the collected data must
    contain ``user_name`` and ``posts``, and ``posts`` may be empty only
    when the private ``wall_is_empty`` flag is set. Every post without a
    ``title`` gets the user name as its title and an error is logged.

    If a ``ParseError`` is raised along the way, the raw HTML is inspected
    in order to raise a more specific exception instead.

    Returns:
        The dictionary of collected data.

    Raises:
        PrivateGroupError: the page has a ``title`` heading announcing a
            private group.
        ProfileNotAvailableError: the page has a ``profile_deleted`` div.
        ServerError: the page is titled as an error page; the message is
            the markup-free text of its ``body`` div.
        ParseError: none of the above explains the failure.
    """
    try:
        self.__data = {}
        self.__private_data = {}

        try:
            HTMLPageParser.parse(self, html)
        except _StopParsing:
            # Deliberately ignored: handled as a normal end of parsing.
            pass

        data = self.__data

        # Keys that must have been collected, with the error reported when
        # one is missing (checked in this order).
        required_keys = (
            ("user_name", "Unable to find the user name."),
            ("posts", "Unable to find the wall."),
        )
        for key, message in required_keys:
            if key not in data:
                raise ParseError(message)

        posts = data["posts"]
        # An empty wall is acceptable only when it was explicitly flagged.
        if not (posts or self.__private_data.get("wall_is_empty")):
            raise ParseError("Unable to find wall posts.")

        for post in posts:
            if "title" in post:
                continue
            LOG.error("Unable to find a title for post %s.", post["url"])
            # Fall back to the user name as the post title.
            post["title"] = data["user_name"]

        return data
    except ParseError:
        # Try to understand why we haven't found the wall on the page
        attrs = self.tag_attrs_regex
        icase_verbose = re.IGNORECASE | re.VERBOSE

        # Matches a class attribute that lists {name} among its classes,
        # whether unquoted, single-quoted or double-quoted.
        class_attr_regex_template = r""" \s+class=(?: {name} | '(?:[^']*\s+)?{name}(?:\s+[^']*)?' | "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?" ) """

        # It may be a private group
        private_group_regex = (
            r""" <h1""" + attrs
            + r""" \s+id=(?:title|'title'|"title")""" + attrs
            + r""" \s*> \s*Закрытая\s+группа """
        )
        if re.search(private_group_regex, html, icase_verbose):
            raise PrivateGroupError()

        # User's profile may be deleted
        deleted_profile_regex = (
            r""" <div""" + attrs
            + class_attr_regex_template.format(name="profile_deleted")
            + attrs + r""" \s*> """
        )
        if re.search(deleted_profile_regex, html, icase_verbose):
            raise ProfileNotAvailableError()

        # The server is on maintenance or returned a user friendly error
        server_error_regex = (
            r""" <title""" + attrs
            + r"""\s*> \s*Ошибка\s* </title> .* <div""" + attrs
            + class_attr_regex_template.format(name="body")
            + attrs + r""" \s*> (.*?) </?div """
        )
        match = re.search(
            server_error_regex, html, re.VERBOSE | re.DOTALL | re.IGNORECASE)

        if not match:
            # Other errors
            raise

        # Drop whole tags first, then any stray angle brackets left over.
        message = re.sub("<[^>]*>", "", match.group(1))
        message = message.replace("<", "").replace(">", "").strip()
        raise ServerError(message)
def __init__(self, ignore_errors=True):
    '''Initializes the base parser and remembers the ignore_errors flag.

    ignore_errors is stored unchanged in a private attribute. How it is
    consumed is not visible from this method (presumably it makes parsing
    tolerant of unexpected markup -- TODO confirm against its readers).
    '''

    # The base class is initialized explicitly before the flag is stored.
    HTMLPageParser.__init__(self)
    self.__ignore_errors = ignore_errors
def parse(self, html):
    '''Parses the specified HTML.

    The page is handed to ``HTMLPageParser.parse``; a ``_StopParsing``
    raised from there is swallowed. Afterwards the collected data must
    contain ``user_name`` and ``posts``, and ``posts`` may be empty only
    when the private ``wall_is_empty`` flag is set. Every post without a
    ``title`` gets the user name as its title and an error is logged.

    If a ``ParseError`` is raised along the way, the raw HTML is inspected
    in order to raise a more specific exception instead.

    Returns the dictionary of collected data.

    Raises:
        PrivateGroupError: the page has a ``title`` heading announcing a
            private group.
        ProfileNotAvailableError: the page has a ``profile_deleted`` div.
        ServerError: the page is titled as an error page; the message is
            the markup-free text of its ``body`` div.
        ParseError: none of the above explains the failure.
    '''

    try:
        self.__data = {}
        self.__private_data = {}

        try:
            HTMLPageParser.parse(self, html)
        except _StopParsing:
            # Deliberately ignored: handled as a normal end of parsing.
            pass

        if 'user_name' not in self.__data:
            raise ParseError('Unable to find the user name.')

        if 'posts' not in self.__data:
            raise ParseError('Unable to find the wall.')

        # An empty wall is acceptable only when it was explicitly flagged.
        if not self.__data['posts'] and not self.__private_data.get('wall_is_empty'):
            raise ParseError('Unable to find wall posts.')

        for post in self.__data['posts']:
            if 'title' not in post:
                LOG.error('Unable to find a title for post %s.', post['url'])
                # Fall back to the user name as the post title.
                post['title'] = self.__data['user_name']

        return self.__data
    except ParseError:
        # Try to understand why we haven't found the wall on the page

        # Matches a class attribute that lists {name} among its classes,
        # whether unquoted, single-quoted or double-quoted.
        class_attr_regex_template = r''' \s+class=(?: {name} | '(?:[^']*\s+)?{name}(?:\s+[^']*)?' | "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?" ) '''

        # NOTE: the Cyrillic words below are separate u'' literals joined to
        # raw fragments. The ``ur`` prefix is a syntax error on Python 3,
        # whereas r'' + u'' produces the same pattern text on both Python 2
        # and Python 3.

        # It may be a private group
        if re.search(
            r''' <h1''' + self.tag_attrs_regex
            + r''' \s+id=(?:title|'title'|"title")''' + self.tag_attrs_regex
            + r''' \s*> \s*''' + u'''Закрытая''' + r'''\s+''' + u'''группа ''',
            html, re.IGNORECASE | re.VERBOSE
        ):
            raise PrivateGroupError()

        # User's profile may be deleted
        if re.search(
            r''' <div''' + self.tag_attrs_regex
            + class_attr_regex_template.format(name='profile_deleted')
            + self.tag_attrs_regex + r''' \s*> ''',
            html, re.IGNORECASE | re.VERBOSE
        ):
            raise ProfileNotAvailableError()

        # The server is on maintenance or returned a user friendly error -->
        match = re.search(
            r''' <title''' + self.tag_attrs_regex
            + r'''\s*> \s*''' + u'''Ошибка''' + r'''\s* </title> .* <div'''
            + self.tag_attrs_regex
            + class_attr_regex_template.format(name='body')
            + self.tag_attrs_regex + r''' \s*> (.*?) </?div ''',
            html, re.VERBOSE | re.DOTALL | re.IGNORECASE
        )

        if match:
            # Drop whole tags first, then any stray angle brackets left over.
            raise ServerError(
                re.sub('<[^>]*>', '', match.group(1)).replace('<', '').replace('>', '').strip())
        # The server is on maintenance or returned a user friendly error <--

        # Other errors
        raise
def parse(self, html):
    '''Parses the specified HTML.

    Runs ``HTMLPageParser.parse`` on the page (ignoring ``_StopParsing``)
    and then validates what was collected: ``user_name`` and ``posts`` must
    be present, and ``posts`` may be empty only when the private
    ``wall_is_empty`` flag is set. A post that has no ``title`` is given the
    user name as its title, and an error is logged for it.

    When a ``ParseError`` is raised during any of this, the raw HTML is
    searched for known page kinds so that a more specific exception can be
    raised in its place.

    Returns the dictionary of collected data.

    Raises:
        PrivateGroupError: the page has a ``title`` heading announcing a
            private group.
        ProfileNotAvailableError: the page has a ``profile_deleted`` div.
        ServerError: the page is titled as an error page; the message is
            the markup-free text of its ``body`` div.
        ParseError: the failure matches none of the known page kinds.
    '''

    try:
        self.__data = {}
        self.__private_data = {}

        try:
            HTMLPageParser.parse(self, html)
        except _StopParsing:
            # Deliberately ignored: handled as a normal end of parsing.
            pass

        if 'user_name' not in self.__data:
            raise ParseError('Unable to find the user name.')

        if 'posts' not in self.__data:
            raise ParseError('Unable to find the wall.')

        # No posts is fine only if the wall was explicitly flagged as empty.
        if not self.__data['posts'] and not self.__private_data.get(
                'wall_is_empty'):
            raise ParseError('Unable to find wall posts.')

        for post in self.__data['posts']:
            if 'title' not in post:
                LOG.error('Unable to find a title for post %s.',
                          post['url'])
                # Use the user name in place of the missing title.
                post['title'] = self.__data['user_name']

        return self.__data
    except ParseError:
        # Try to understand why we haven't found the wall on the page

        # Pattern for a class attribute naming {name} among its classes, in
        # unquoted, single-quoted or double-quoted form.
        class_attr_regex_template = r''' \s+class=(?: {name} | '(?:[^']*\s+)?{name}(?:\s+[^']*)?' | "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?" ) '''

        # NOTE: ``ur'...'`` literals do not compile on Python 3. Each
        # Cyrillic word is therefore written as its own u'' literal and
        # concatenated with raw fragments, which gives the identical
        # pattern text on Python 2 and Python 3 alike.

        # It may be a private group
        if re.search(
                r''' <h1''' + self.tag_attrs_regex +
                r''' \s+id=(?:title|'title'|"title")''' +
                self.tag_attrs_regex +
                r''' \s*> \s*''' + u'''Закрытая''' +
                r'''\s+''' + u'''группа ''',
                html, re.IGNORECASE | re.VERBOSE):
            raise PrivateGroupError()

        # User's profile may be deleted
        if re.search(
                r''' <div''' + self.tag_attrs_regex +
                class_attr_regex_template.format(name='profile_deleted') +
                self.tag_attrs_regex + r''' \s*> ''',
                html, re.IGNORECASE | re.VERBOSE):
            raise ProfileNotAvailableError()

        # The server is on maintenance or returned a user friendly error -->
        match = re.search(
            r''' <title''' + self.tag_attrs_regex +
            r'''\s*> \s*''' + u'''Ошибка''' +
            r'''\s* </title> .* <div''' + self.tag_attrs_regex +
            class_attr_regex_template.format(name='body') +
            self.tag_attrs_regex + r''' \s*> (.*?) </?div ''',
            html, re.VERBOSE | re.DOTALL | re.IGNORECASE)

        if match:
            # Remove complete tags, then any leftover angle brackets.
            raise ServerError(
                re.sub('<[^>]*>', '', match.group(1)).replace(
                    '<', '').replace('>', '').strip())
        # The server is on maintenance or returned a user friendly error <--

        # Other errors
        raise