Beispiel #1
0
    def test(self):
        '''Testing HTMLPageParser.

        Feeds every file found in the ``html_parser`` directory (relative to
        the current working directory) to a fresh ``HTMLPageParser``.  Any
        exception raised while reading or parsing a page propagates to the
        caller.
        '''

        page_dir = 'html_parser'

        for page_name in os.listdir(page_dir):
            page_path = os.path.join(page_dir, page_name)

            # A single parenthesized argument prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print('Testing "{0}"...'.format(page_path))

            # Read raw bytes so that ``decode()`` is valid on every Python
            # version, and close the file deterministically instead of
            # leaking the handle until garbage collection.
            with open(page_path, 'rb') as page_file:
                raw_page = page_file.read()

            HTMLPageParser().parse(raw_page.decode('utf-8'))
Beispiel #2
0
    def parse(self, html):
        """Parse a wall page and return the data gathered from it.

        ``self.__data`` and ``self.__private_data`` are reset, then the base
        ``HTMLPageParser.parse()`` is run (presumably its callbacks fill both
        dictionaries -- they are not visible here).  ``_StopParsing`` raised
        during that run is not treated as an error.

        Returns:
            ``self.__data``; it contains ``"user_name"`` and ``"posts"``, and
            every post carries a ``"title"`` (the user name is used as a
            fallback when a post has none).

        Raises:
            PrivateGroupError: the page has an ``<h1>`` with ``id=title``
                whose text starts with the Russian for "Closed group".
            ProfileNotAvailableError: the page has a ``<div>`` with the
                ``profile_deleted`` class.
            ServerError: the page title is the Russian for "Error"; the
                message is taken from the content of a ``body``-class
                ``<div>`` with markup removed.
            ParseError: the expected data is missing for any other reason.
        """

        try:
            self.__data = {}
            self.__private_data = {}

            try:
                HTMLPageParser.parse(self, html)
            except _StopParsing:
                # Presumably raised on purpose to cut parsing short.
                pass

            page_data = self.__data

            required = (
                ("user_name", "Unable to find the user name."),
                ("posts", "Unable to find the wall."),
            )
            for key, complaint in required:
                if key not in page_data:
                    raise ParseError(complaint)

            # An empty post list is accepted only when the "wall_is_empty"
            # flag was recorded in the private data.
            if not (page_data["posts"] or self.__private_data.get("wall_is_empty")):
                raise ParseError("Unable to find wall posts.")

            for post in page_data["posts"]:
                if "title" in post:
                    continue

                LOG.error("Unable to find a title for post %s.", post["url"])
                post["title"] = page_data["user_name"]

            return page_data
        except ParseError:
            # Look for a known reason why the wall is absent and report it
            # with a more specific exception.

            # Matches a class attribute (unquoted, single- or double-quoted)
            # that contains {name} as one of its whitespace-separated values.
            class_attr_template = r"""
                \s+class=(?:
                    {name}
                    |
                    '(?:[^']*\s+)?{name}(?:\s+[^']*)?'
                    |
                    "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?"
                )
            """

            # Private group: the title heading says "Closed group".
            private_group_regex = "".join((
                r"""
                <h1""",
                self.tag_attrs_regex,
                r"""
                    \s+id=(?:title|'title'|"title")""",
                self.tag_attrs_regex,
                r"""
                \s*>
                    \s*Закрытая\s+группа
            """,
            ))
            if re.search(private_group_regex, html, re.IGNORECASE | re.VERBOSE):
                raise PrivateGroupError()

            # Deleted profile: a <div> carrying the "profile_deleted" class.
            deleted_profile_regex = "".join((
                r"""
                <div""",
                self.tag_attrs_regex,
                class_attr_template.format(name="profile_deleted"),
                self.tag_attrs_regex,
                r"""
                \s*>
            """,
            ))
            if re.search(deleted_profile_regex, html, re.IGNORECASE | re.VERBOSE):
                raise ProfileNotAvailableError()

            # Maintenance / user-friendly error page: the title is "Error"
            # and the explanation sits in a <div> with the "body" class.
            error_page_regex = "".join((
                r"""
                <title""",
                self.tag_attrs_regex,
                r"""\s*>
                    \s*Ошибка\s*
                </title>
                .*
                <div""",
                self.tag_attrs_regex,
                class_attr_template.format(name="body"),
                self.tag_attrs_regex,
                r"""
                \s*>
                    (.*?)
                </?div
            """,
            ))
            error_page = re.search(error_page_regex, html, re.VERBOSE | re.DOTALL | re.IGNORECASE)

            if error_page:
                # Drop complete tags first, then any stray angle brackets.
                message = re.sub("<[^>]*>", "", error_page.group(1))
                for bracket in ("<", ">"):
                    message = message.replace(bracket, "")
                raise ServerError(message.strip())

            # Nothing recognised: propagate the original ParseError.
            raise
Beispiel #3
0
 def __init__(self, ignore_errors=True):
     """Initialize the base parser and store the ``ignore_errors`` flag.

     ``ignore_errors`` is kept unchanged in a private attribute; how it is
     consulted is not visible in this method.
     """
     HTMLPageParser.__init__(self)
     self.__ignore_errors = ignore_errors
Beispiel #4
0
    def parse(self, html):
        '''Parses the specified HTML.'''

        try:
            self.__data = {}
            self.__private_data = {}

            try:
                HTMLPageParser.parse(self, html)
            except _StopParsing:
                pass


            if 'user_name' not in self.__data:
                raise ParseError('Unable to find the user name.')

            if 'posts' not in self.__data:
                raise ParseError('Unable to find the wall.')

            if not self.__data['posts'] and not self.__private_data.get('wall_is_empty'):
                raise ParseError('Unable to find wall posts.')


            for post in self.__data['posts']:
                if 'title' not in post:
                    LOG.error('Unable to find a title for post %s.', post['url'])
                    post['title'] = self.__data['user_name']


            return self.__data
        except ParseError:
            # Try to understand why we haven't found the wall on the page

            class_attr_regex_template = r'''
                \s+class=(?:
                    {name}
                    |
                    '(?:[^']*\s+)?{name}(?:\s+[^']*)?'
                    |
                    "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?"
                )
            '''

            # It may be a private group
            if re.search(r'''
                <h1''' +
                    self.tag_attrs_regex + r'''
                    \s+id=(?:title|'title'|"title")''' +
                    self.tag_attrs_regex + ur'''
                \s*>
                    \s*Закрытая\s+группа
            ''', html, re.IGNORECASE | re.VERBOSE):
                raise PrivateGroupError()

            # User's profile may be deleted
            if re.search(r'''
                <div''' +
                    self.tag_attrs_regex +
                    class_attr_regex_template.format(name = 'profile_deleted') +
                    self.tag_attrs_regex + r'''
                \s*>
            ''', html, re.IGNORECASE | re.VERBOSE):
                raise ProfileNotAvailableError()



            # The server is on maintenance or returned a user friendly error -->
            match = re.search(r'''
                <title''' + self.tag_attrs_regex + ur'''\s*>
                    \s*Ошибка\s*
                </title>
                .*
                <div''' +
                    self.tag_attrs_regex +
                    class_attr_regex_template.format(name = 'body') +
                    self.tag_attrs_regex + r'''
                \s*>
                    (.*?)
                </?div
            ''', html, re.VERBOSE | re.DOTALL | re.IGNORECASE)

            if match:
                raise ServerError(
                    re.sub('<[^>]*>', '', match.group(1)).replace('<', '').replace('>', '').strip())
            # The server is on maintenance or returned a user friendly error <--


            # Other errors
            raise
Beispiel #5
0
 def __init__(self, ignore_errors=True):
     '''Initialize the base parser and store the ``ignore_errors`` flag.

     ``ignore_errors`` is kept unchanged in a private attribute; how it is
     consulted is not visible in this method.
     '''
     HTMLPageParser.__init__(self)
     self.__ignore_errors = ignore_errors
Beispiel #6
0
    def parse(self, html):
        '''Parses the specified HTML.'''

        try:
            self.__data = {}
            self.__private_data = {}

            try:
                HTMLPageParser.parse(self, html)
            except _StopParsing:
                pass

            if 'user_name' not in self.__data:
                raise ParseError('Unable to find the user name.')

            if 'posts' not in self.__data:
                raise ParseError('Unable to find the wall.')

            if not self.__data['posts'] and not self.__private_data.get(
                    'wall_is_empty'):
                raise ParseError('Unable to find wall posts.')

            for post in self.__data['posts']:
                if 'title' not in post:
                    LOG.error('Unable to find a title for post %s.',
                              post['url'])
                    post['title'] = self.__data['user_name']

            return self.__data
        except ParseError:
            # Try to understand why we haven't found the wall on the page

            class_attr_regex_template = r'''
                \s+class=(?:
                    {name}
                    |
                    '(?:[^']*\s+)?{name}(?:\s+[^']*)?'
                    |
                    "(?:[^"]*\s+)?{name}(?:\s+[^"]*)?"
                )
            '''

            # It may be a private group
            if re.search(
                    r'''
                <h1''' + self.tag_attrs_regex + r'''
                    \s+id=(?:title|'title'|"title")''' + self.tag_attrs_regex +
                    ur'''
                \s*>
                    \s*Закрытая\s+группа
            ''', html, re.IGNORECASE | re.VERBOSE):
                raise PrivateGroupError()

            # User's profile may be deleted
            if re.search(
                    r'''
                <div''' + self.tag_attrs_regex +
                    class_attr_regex_template.format(name='profile_deleted') +
                    self.tag_attrs_regex + r'''
                \s*>
            ''', html, re.IGNORECASE | re.VERBOSE):
                raise ProfileNotAvailableError()

            # The server is on maintenance or returned a user friendly error -->
            match = re.search(
                r'''
                <title''' + self.tag_attrs_regex + ur'''\s*>
                    \s*Ошибка\s*
                </title>
                .*
                <div''' + self.tag_attrs_regex +
                class_attr_regex_template.format(name='body') +
                self.tag_attrs_regex + r'''
                \s*>
                    (.*?)
                </?div
            ''', html, re.VERBOSE | re.DOTALL | re.IGNORECASE)

            if match:
                raise ServerError(
                    re.sub('<[^>]*>', '',
                           match.group(1)).replace('<',
                                                   '').replace('>',
                                                               '').strip())
            # The server is on maintenance or returned a user friendly error <--

            # Other errors
            raise