Python html_to_xhtml 예제들, clean.html_to_xhtml Python 예제들

예제 #1

0

파일 보기

파일: chapter.py 프로젝트: cigani/pypub

    def create_chapter_from_string(self, html_string, url=None, title=None):
        """
        Create a Chapter object from an html string.

        The string is sanitized with self.clean_function and converted with
        clean.html_to_xhtml; the result is used as the chapter content.

        Args:
            html_string (string): The html or xhtml content of the created
                Chapter
            url (Option[string]): Passed unchanged to the created Chapter.
            title (Option[string]): The title of the created Chapter. When
                falsy, the title is read from the <title> element of
                html_string; if no usable title is found, 'Ebook Chapter'
                is used.

        Returns:
            Chapter: A chapter object whose content is the cleaned xhtml
                string and whose title is that provided or inferred from
                html_string
        """
        clean_html_string = self.clean_function(html_string)
        clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
        if not title:
            try:
                root = BeautifulSoup(html_string, 'html.parser')
                title_node = root.title
                # root.title is None when the document has no <title>, and
                # .string is None when <title> is empty or holds more than
                # one child; neither case may reach unicode(), which would
                # otherwise raise or produce the literal title u'None'.
                if title_node is None:
                    title_text = None
                else:
                    title_text = title_node.string
            except IndexError:
                title_text = None
            if title_text is None:
                title = 'Ebook Chapter'
            else:
                title = unicode(title_text)
        return Chapter(clean_xhtml_string, title, url)

예제 #2

0

파일 보기

파일: chapter.py 프로젝트: Redpoint1/pypub

    def create_chapter_from_string(self, html_string, url=None, title=None):
        """
        Create a Chapter object from an html string.

        The string is sanitized with self.clean_function and converted with
        clean.html_to_xhtml; the result is used as the chapter content.

        Args:
            html_string (string): The html or xhtml content of the created
                Chapter
            url (Option[string]): Passed unchanged to the created Chapter.
            title (Option[string]): The title of the created Chapter. When
                falsy, the title is read from the <title> element of
                html_string; if no usable title is found, 'Ebook Chapter'
                is used.

        Returns:
            Chapter: A chapter object whose content is the cleaned xhtml
                string and whose title is that provided or inferred from
                html_string
        """
        clean_html_string = self.clean_function(html_string)
        clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
        if not title:
            try:
                root = BeautifulSoup(html_string, 'html.parser')
                title_node = root.title
                # .string is None when <title> is empty or holds more than
                # one child, so it needs the same fallback as a missing
                # <title> element.
                title = title_node.string if title_node is not None else None
            except (IndexError, ValueError):
                title = None
            if title is None:
                title = 'Ebook Chapter'
        return Chapter(clean_xhtml_string, title, url)

예제 #3

0

파일 보기

파일: unit_tests_clean.py 프로젝트: vasudevram/pypub

 def test_html_to_xhtml(self):
     # The loosely written html below (upper-case tag and attribute names,
     # unclosed <br>) is run through clean, html_to_xhtml and condense, and
     # the result is compared with the expected single-line xhtml.
     expected_xhtml = u'<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml"><head></head><body><div id="Test">Hello</div><br /><br /></body></html>'
     loose_html = u'''
             <!DOCTYPE html>
             <html>
              <head>
              </head>
              <body>
               <DIV ID="Test">Hello</div>
               <br>
               <br>
              </body>
             </html>
             '''
     cleaned_html = clean(loose_html)
     actual_xhtml = condense(html_to_xhtml(cleaned_html))
     self.assertEqual(actual_xhtml, expected_xhtml)

예제 #4

0

파일 보기

 def test_html_to_xhtml(self):
     # Input: html with upper-case tag/attribute names and unclosed <br>.
     # It is passed through clean -> html_to_xhtml -> condense and must
     # equal the single-line xhtml document in `wanted`.
     wanted = u'<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml"><head></head><body><div id="Test">Hello</div><br /><br /></body></html>'
     raw_markup = u'''
             <!DOCTYPE html>
             <html>
              <head>
              </head>
              <body>
               <DIV ID="Test">Hello</div>
               <br>
               <br>
              </body>
             </html>
             '''
     converted = html_to_xhtml(clean(raw_markup))
     self.assertEqual(condense(converted), wanted)

예제 #5

0

파일 보기

    def create_chapter_from_string(self,
                                   html_string,
                                   url=None,
                                   title=None,
                                   request_object=None):
        """
        Create a Chapter object from an html string or a response object.

        The html is sanitized with self.clean_function and converted with
        clean.html_to_xhtml; the result is used as the chapter content.

        Args:
            html_string (string): The html or xhtml content of the created
                Chapter. Replaced by request_object.text when a truthy
                request_object is given, and by '<html></html>' when it is
                empty and no truthy request_object is given.
            url (Option[string]): Passed unchanged to the created Chapter.
            title (Option[string]): The title of the created Chapter. When
                falsy, the title is read from the <title> element of the
                html; if no usable title is found, 'Ebook Chapter' is used.
            request_object: Optional object exposing a writable `encoding`
                attribute and a `text` attribute (presumably a
                requests.Response -- confirm against callers).

        Returns:
            Chapter: A chapter object whose content is the cleaned xhtml
                string and whose title is that provided or inferred from
                the html
        """
        if request_object:
            # Decode as utf-8 first; the meta-declared encoding is only
            # consulted further down, while inferring the title.
            request_object.encoding = 'utf-8'
            html_string = request_object.text
        elif not html_string:  # e.g. on a 404 there is no usable response
            html_string = '<html></html>'

        clean_html_string = self.clean_function(html_string)
        clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
        if not title:
            try:
                root = BeautifulSoup(html_string, 'html.parser')
                if request_object:
                    meta_encoding = hole_meta_encoding(root)
                    if meta_encoding and (meta_encoding.lower() != 'utf-8'):
                        # The page declares a non-utf-8 encoding: decode the
                        # response again and redo parsing and cleaning.
                        print('Encoding to meta encoding: ' +
                              repr(meta_encoding))
                        request_object.encoding = meta_encoding
                        html_string = request_object.text
                        root = BeautifulSoup(html_string, 'html.parser')
                        clean_html_string = self.clean_function(html_string)
                        clean_xhtml_string = clean.html_to_xhtml(
                            clean_html_string)

                title_node = root.title
                # .string is None when <title> is empty or holds more than
                # one child, so it needs the same fallback as a missing
                # <title> element.
                title = title_node.string if title_node is not None else None
            except (IndexError, ValueError):
                title = None
            if title is None:
                title = 'Ebook Chapter'
        return Chapter(clean_xhtml_string, title, url)

예제 #6

0

파일 보기

    def create_chapter_from_string(self,
                                   html_string,
                                   url=None,
                                   title=None,
                                   request_object=None):
        """
        Create a Chapter object from an html string or a response object.

        The html is sanitized with self.clean_function and converted with
        clean.html_to_xhtml; the result is used as the chapter content.

        Args:
            html_string (string): The html or xhtml content of the created
                Chapter. Replaced by text taken from request_object when a
                truthy request_object is given, and by '<html></html>' when
                it is empty and no truthy request_object is given.
            url (Option[string]): Passed unchanged to the created Chapter.
            title (Option[string]): The title of the created Chapter. When
                falsy, the title is read from the <title> element of the
                html: 'Unknown title' is used when the element exists but
                has no single string, 'Ebook Chapter' when it is missing.
            request_object: Optional object exposing a writable `encoding`
                attribute and a `text` attribute (presumably a
                requests.Response -- confirm against callers).

        Returns:
            Chapter: A chapter object whose content is the cleaned xhtml
                string and whose title is that provided or inferred from
                the html
        """
        if request_object:
            # Test case: https://new.qq.com/omn/20180816/20180816A0A0D0.html
            # which returns headers "content-type: text/html; charset=GB2312"
            # ... shouldn't make it utf-8
            if not request_object.encoding:
                # Just in case: the default depends on the content-type
                # header (alternative to the html meta tag).
                request_object.encoding = 'utf-8'
                html_string = request_object.text
            else:
                # test case(ISO-8859-1): http://castic.xiaoxiaotong.org/2019/studentDetails.html?77061
                try:
                    html_string = request_object.text.encode(
                        request_object.encoding).decode('utf-8')
                except (UnicodeEncodeError, UnicodeDecodeError, LookupError):
                    # test case(UnicodeEncodeError): https://new.qq.com/omn/20191011/20191011A0Q9JI00.html
                    # test case: https://www.dawuxia.net/forum.php?mod=viewthread&tid=1034211
                    # LookupError: the declared encoding is not a codec
                    # name Python knows, so the text cannot be re-encoded.
                    html_string = request_object.text
        elif not html_string:  # e.g. on a 404 there is no usable response
            html_string = '<html></html>'

        clean_html_string = self.clean_function(html_string)
        clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
        if not title:
            try:
                root = BeautifulSoup(html_string, 'html.parser')
                if request_object:
                    meta_encoding = hole_meta_encoding(root)
                    if meta_encoding and (meta_encoding.lower() != 'utf-8'):
                        # The page declares a non-utf-8 encoding: decode the
                        # response again and redo parsing and cleaning.
                        print('Encoding to meta encoding: ' +
                              repr(meta_encoding))
                        request_object.encoding = meta_encoding
                        html_string = request_object.text
                        root = BeautifulSoup(html_string, 'html.parser')
                        clean_html_string = self.clean_function(html_string)
                        clean_xhtml_string = clean.html_to_xhtml(
                            clean_html_string)

                title_node = root.title
                if title_node is None:
                    title = 'Ebook Chapter'
                else:
                    title = title_node.string
                    # .string is None when <title> is empty or holds more
                    # than one child.
                    if title is None:
                        title = 'Unknown title'
            except (IndexError, ValueError):
                title = 'Ebook Chapter'
        return Chapter(clean_xhtml_string, title, url)