def test_content_meta_attribute_value(self):
    # A CharsetMetaAttributeValue stands in for an encoding name: it
    # compares equal to the name it was built from, remembers that
    # name as original_value, and encode() is expected to hand back
    # the name of whichever encoding is requested.
    declared = "euc-jp"
    attribute = CharsetMetaAttributeValue(declared)

    assert declared == attribute
    assert declared == attribute.original_value

    for target in ("utf8", "ascii"):
        assert target == attribute.encode(target)
def set_up_substitutions(self, tag):
    """Swap a <meta> tag's declared encoding for a stand-in object.

    We are interested in <meta> tags that say what encoding the
    document was originally in. This means HTML 5-style <meta> tags
    that provide the "charset" attribute. It also means HTML 4-style
    <meta> tags that provide the "content" attribute and have
    "http-equiv" set to "content-type".

    In both cases the value of the appropriate attribute is replaced
    with a stand-in object that can take on any encoding.

    :param tag: The tag to inspect. It must expose a ``name``
        attribute, a dict-style ``get()`` and item assignment.
    :return: True if one of the tag's attributes was replaced with a
        stand-in object, False otherwise.
    """
    # We are only interested in <meta> tags
    if tag.name != 'meta':
        return False

    http_equiv = tag.get('http-equiv')
    content = tag.get('content')
    charset = tag.get('charset')

    if charset is not None:
        # HTML 5 style:
        # <meta charset="utf8">
        tag['charset'] = CharsetMetaAttributeValue(charset)
        return True

    if (content is not None and http_equiv is not None
            and http_equiv.lower() == 'content-type'):
        # HTML 4 style:
        # <meta http-equiv="content-type" content="text/html; charset=utf8">
        #
        # This branch performs a substitution too, so it must report
        # success just like the HTML 5 branch does.
        tag['content'] = ContentMetaAttributeValue(content)
        return True

    return False
def set_up_substitutions(self, tag):
    """Replace the declared encoding in a <meta> tag with a placeholder,
    to be substituted when the tag is output to a string.

    An HTML document may come in to Beautiful Soup as one
    encoding, but exit in a different encoding, and the <meta> tag
    needs to be changed to reflect this.

    Two kinds of <meta> tag are handled: HTML 5-style tags that
    provide the "charset" attribute, and HTML 4-style tags that
    provide the "content" attribute and have "http-equiv" set to
    "content-type" (compared case-insensitively). When both forms are
    present on one tag, only the "charset" attribute is replaced.

    :param tag: A `Tag`
    :return: Whether or not a substitution was performed.
    """
    # We are only interested in <meta> tags
    if tag.name != "meta":
        return False

    http_equiv = tag.get("http-equiv")
    content = tag.get("content")
    charset = tag.get("charset")

    # Track the outcome explicitly so that both branches report it;
    # the HTML 4 branch replaces an attribute just as the HTML 5 one
    # does and must not be reported as "nothing substituted".
    substituted = False
    if charset is not None:
        # HTML 5 style:
        # <meta charset="utf8">
        tag["charset"] = CharsetMetaAttributeValue(charset)
        substituted = True
    elif (
        content is not None
        and http_equiv is not None
        and http_equiv.lower() == "content-type"
    ):
        # HTML 4 style:
        # <meta http-equiv="content-type" content="text/html; charset=utf8">
        tag["content"] = ContentMetaAttributeValue(content)
        substituted = True

    return substituted
def test_content_meta_attribute_value(self):
    # The stand-in value should compare equal to the encoding name it
    # was created with, keep that name in original_value, and produce
    # the requested encoding's name from encode().
    source_encoding = "euc-jp"
    attribute = CharsetMetaAttributeValue(source_encoding)

    self.assertEqual(source_encoding, attribute)
    self.assertEqual(source_encoding, attribute.original_value)

    requested = "utf8"
    self.assertEqual("utf8", attribute.encode(requested))