def create_chapter_from_string(self, html_string, url=None, title=None):
    """
    Creates a Chapter object from a string. Sanitizes the string using
    the clean_function method, and saves it as the content of the created
    chapter.

    Args:
        html_string (string): The html or xhtml content of the created
            Chapter
        url (Option[string]): Passed through unchanged to the created
            Chapter; it is not fetched or parsed by this method.
        title (Option[string]): The title of the created Chapter. By
            default, this is None, in which case the title is read from
            the <title> element of html_string. When no usable title text
            is found, 'Ebook Chapter' is used instead.

    Returns:
        Chapter: A chapter object whose content is the cleaned xhtml
            version of the given string and whose title is that provided
            or inferred from html_string
    """
    clean_html_string = self.clean_function(html_string)
    clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
    if not title:
        try:
            root = BeautifulSoup(html_string, 'html.parser')
            title_node = root.title
            # root.title is None when the document has no <title>, and
            # .string is None when <title> is empty or has several
            # children; both cases must fall back to the default instead
            # of raising AttributeError or yielding the text "None".
            title_text = title_node.string if title_node is not None else None
        except IndexError:
            # Kept from the original handler so title inference stays
            # best-effort.
            title_text = None
        if title_text is None:
            title = 'Ebook Chapter'
        else:
            try:
                text_type = unicode  # noqa: F821 -- Python 2 builtin
            except NameError:
                text_type = str  # Python 3: str is the unicode text type
            # Convert to a plain text object rather than keeping the
            # parser's string node as the title.
            title = text_type(title_text)
    return Chapter(clean_xhtml_string, title, url)
def create_chapter_from_string(self, html_string, url=None, title=None):
    """
    Creates a Chapter object from a string. Sanitizes the string using
    the clean_function method, and saves it as the content of the created
    chapter.

    Args:
        html_string (string): The html or xhtml content of the created
            Chapter
        url (Option[string]): Passed through unchanged to the created
            Chapter; it is not fetched or parsed by this method.
        title (Option[string]): The title of the created Chapter. By
            default, this is None, in which case the title is read from
            the <title> element of html_string. 'Ebook Chapter' is used
            when there is no <title> element, and 'Unknown title' when
            the element exists but has no single text content.

    Returns:
        Chapter: A chapter object whose content is the cleaned xhtml
            version of the given string and whose title is that provided
            or inferred from html_string
    """
    clean_html_string = self.clean_function(html_string)
    clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
    if not title:
        try:
            title_node = BeautifulSoup(html_string, 'html.parser').title
        except (IndexError, ValueError):
            # Same exception types the original handler swallowed; a
            # failed parse is treated like a missing <title>.
            title_node = None
        if title_node is None:
            title = 'Ebook Chapter'
        else:
            title = title_node.string
            if title is None:
                # <title> is empty or holds several children, so .string
                # is None; never hand a None title to Chapter.
                title = 'Unknown title'
    return Chapter(clean_xhtml_string, title, url)
def test_html_to_xhtml(self):
    """
    Checks that running clean, html_to_xhtml and condense over a loosely
    written html document (upper-case tag and attribute names, unclosed
    <br> tags, extra whitespace) yields the expected xhtml string.
    """
    expected_xhtml = (
        u'<!DOCTYPE html>'
        u'<html xmlns="http://www.w3.org/1999/xhtml">'
        u'<head></head>'
        u'<body><div id="Test">Hello</div><br /><br /></body>'
        u'</html>'
    )
    # Adjacent literals are joined at compile time, so the input keeps
    # its leading and trailing space.
    sloppy_html = (
        u' <!DOCTYPE html> <html> <head> </head> <body> '
        u'<DIV ID="Test">Hello</div> <br> <br> </body> </html> '
    )
    cleaned = clean(sloppy_html)
    converted = html_to_xhtml(cleaned)
    self.assertEqual(condense(converted), expected_xhtml)
def create_chapter_from_string(self, html_string, url=None, title=None,
                               request_object=None):
    """
    Creates a Chapter object from a string. Sanitizes the string using
    the clean_function method, and saves it as the content of the created
    chapter.

    Args:
        html_string (string): The html or xhtml content of the created
            Chapter. Ignored when request_object is truthy; replaced by
            '<html></html>' when it is empty and request_object is falsy.
        url (Option[string]): Passed through unchanged to the created
            Chapter; it is not fetched or parsed by this method.
        title (Option[string]): The title of the created Chapter. By
            default, this is None, in which case the title is read from
            the <title> element of the html. 'Ebook Chapter' is used when
            there is no <title> element, and 'Unknown title' when the
            element exists but has no single text content.
        request_object (Option[object]): Response-like object exposing a
            writable `encoding` attribute and a `text` attribute
            (presumably a requests.Response -- confirm against callers).
            When truthy, its text decoded as utf-8 is used as the html.

    Returns:
        Chapter: A chapter object whose content is the cleaned xhtml
            version of the html and whose title is that provided or
            inferred from the html
    """
    if request_object:
        request_object.encoding = 'utf-8'
        html_string = request_object.text
    elif not html_string:
        # Original author's note: on a 404 the caller passes
        # request_object=None, so fall back to an empty document.
        html_string = '<html></html>'
    clean_html_string = self.clean_function(html_string)
    clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
    if not title:
        try:
            root = BeautifulSoup(html_string, 'html.parser')
            if request_object:
                # NOTE(review): this re-decode only runs when the title
                # has to be inferred; with an explicit title the content
                # stays decoded as utf-8 -- confirm that is intended.
                meta_encoding = hole_meta_encoding(root)
                if meta_encoding and meta_encoding.lower() != 'utf-8':
                    print('Encoding to meta encoding: ' + repr(meta_encoding))
                    # Decode the response again with the charset that
                    # hole_meta_encoding reported, then redo the parse
                    # and the cleaning on the re-decoded text.
                    request_object.encoding = meta_encoding
                    html_string = request_object.text
                    root = BeautifulSoup(html_string, 'html.parser')
                    clean_html_string = self.clean_function(html_string)
                    clean_xhtml_string = clean.html_to_xhtml(
                        clean_html_string)
            title_node = root.title
            if title_node is None:
                # No <title> element: handled by the except clause below.
                raise ValueError
            title = title_node.string
            if title is None:
                # <title> is empty or holds several children, so .string
                # is None; never hand a None title to Chapter.
                title = 'Unknown title'
        except (IndexError, ValueError):
            title = 'Ebook Chapter'
    return Chapter(clean_xhtml_string, title, url)
def create_chapter_from_string(self, html_string, url=None, title=None,
                               request_object=None):
    """
    Builds a Chapter from html text, or from a response-like object when
    one is supplied. The html is sanitized with the clean_function method
    and converted to xhtml before being stored as the chapter content.

    Args:
        html_string (string): The html or xhtml content of the created
            Chapter. Ignored when request_object is truthy; replaced by
            '<html></html>' when it is empty and request_object is falsy.
        url (Option[string]): Passed through unchanged to the created
            Chapter.
        title (Option[string]): The title of the created Chapter. When
            falsy, the title is read from the <title> element of the
            html: 'Ebook Chapter' if there is no such element, 'Unknown
            title' if the element has no single text content.
        request_object (Option[object]): Response-like object exposing a
            writable `encoding` attribute and a `text` attribute
            (presumably a requests.Response -- confirm against callers).

    Returns:
        Chapter: A chapter object holding the cleaned xhtml, the chosen
            title and the url
    """
    if request_object:
        if request_object.encoding:
            # The response already carries an encoding, which must not be
            # overwritten with utf-8. Regression page whose header is
            # "content-type: text/html; charset=GB2312":
            #   https://new.qq.com/omn/20180816/20180816A0A0D0.html
            # Try to reinterpret the text as utf-8 (ISO-8859-1 page):
            #   http://castic.xiaoxiaotong.org/2019/studentDetails.html?77061
            response_text = request_object.text
            try:
                raw_bytes = response_text.encode(request_object.encoding)
                html_string = raw_bytes.decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                # Reinterpretation failed; keep the text as decoded.
                #   https://new.qq.com/omn/20191011/20191011A0Q9JI00.html
                #   https://www.dawuxia.net/forum.php?mod=viewthread&tid=1034211
                html_string = response_text
        else:
            # No encoding on the response: default to utf-8.
            request_object.encoding = 'utf-8'
            html_string = request_object.text
    elif not html_string:
        # Original author's note: on a 404 the caller passes
        # request_object=None, so fall back to an empty document.
        html_string = '<html></html>'

    sanitized_html = self.clean_function(html_string)
    chapter_xhtml = clean.html_to_xhtml(sanitized_html)

    # An explicit title needs no inference at all.
    if title:
        return Chapter(chapter_xhtml, title, url)

    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        if request_object:
            declared_encoding = hole_meta_encoding(soup)
            if declared_encoding and declared_encoding.lower() != 'utf-8':
                print('Encoding to meta encoding: ' + repr(declared_encoding))
                # Decode again with the charset hole_meta_encoding
                # reported, then rebuild the parse tree and the content.
                request_object.encoding = declared_encoding
                html_string = request_object.text
                soup = BeautifulSoup(html_string, 'html.parser')
                sanitized_html = self.clean_function(html_string)
                chapter_xhtml = clean.html_to_xhtml(sanitized_html)
        heading = soup.title
        if heading is None:
            # No <title> element: handled by the except clause below.
            raise ValueError
        title = heading.string
        if title is None:
            title = 'Unknown title'
    except (IndexError, ValueError):
        title = 'Ebook Chapter'

    return Chapter(chapter_xhtml, title, url)