def serialize(self, treewalker, encoding=None):
    """Serialize a stream of tree-walker tokens, yielding markup fragments.

    This is a generator; ``self.errors`` is reset to an empty list when
    iteration starts.  Depending on the ``inject_meta_charset``,
    ``strip_whitespace``, ``sanitize`` and ``omit_optional_tags`` options on
    ``self``, the token stream is first wrapped in the matching html5lib
    filters.

    :arg treewalker: iterable of token dicts.  Every token has a ``"type"``
        key; the other keys read are ``"name"`` and/or ``"data"``.
    :arg encoding: when truthy, each fragment is encoded with this codec
        before being yielded; otherwise fragments are yielded unencoded.

    Problems found while serializing are reported through
    ``self.serializeError``; a token of unrecognised type has its ``"data"``
    passed to ``self.serializeError`` and produces no output.
    """
    in_cdata = False
    self.errors = []

    # Filters are imported lazily so that only the enabled ones are loaded.
    if encoding and self.inject_meta_charset:
        from html5lib.filters.inject_meta_charset import Filter
        treewalker = Filter(treewalker, encoding)
    # XXX: WhitespaceFilter should be used before OptionalTagFilter
    # for maximum efficiency of this latter filter
    if self.strip_whitespace:
        from html5lib.filters.whitespace import Filter
        treewalker = Filter(treewalker)
    if self.sanitize:
        from html5lib.filters.sanitizer import Filter
        treewalker = Filter(treewalker)
    if self.omit_optional_tags:
        from html5lib.filters.optionaltags import Filter
        treewalker = Filter(treewalker)

    for token in treewalker:
        token_type = token["type"]

        if token_type == "Doctype":
            doctype = "<!DOCTYPE %s>" % token["name"]
            if encoding:
                yield doctype.encode(encoding)
            else:
                yield doctype

        elif token_type in ("Characters", "SpaceCharacters"):
            if token_type == "SpaceCharacters" or in_cdata:
                # Whitespace and raw-text content are emitted unescaped.
                if in_cdata and "</" in token["data"]:
                    self.serializeError(_("Unexpected </ in CDATA"))
                if encoding:
                    yield token["data"].encode(encoding, "strict")
                else:
                    yield token["data"]
            elif encoding:
                yield encode(escape(token["data"]), encoding)
            else:
                yield escape(token["data"])

        elif token_type in ("StartTag", "EmptyTag"):
            name = token["name"]
            if name in rcdataElements and not self.escape_rcdata:
                in_cdata = True
            elif in_cdata:
                self.serializeError(_("Unexpected child element of a CDATA element"))

            attrs = token["data"]
            # sorted() builds a new list, so the token's own attribute
            # container is never reordered in place.
            attrs = sorted(attrs.items() if hasattr(attrs, "items") else attrs)

            attributes = []
            for k, v in attrs:
                if encoding:
                    k = k.encode(encoding, "strict")
                attributes.append(' ')
                attributes.append(k)

                if self.minimize_boolean_attributes and (
                        k in booleanAttributes.get(name, tuple())
                        or k in booleanAttributes.get("", tuple())):
                    # Minimized boolean attribute: the name alone is emitted.
                    continue

                attributes.append("=")
                # Quotes are required for empty values and for values
                # containing whitespace, '>', quotes or '='.
                quote_attr = bool(
                    self.quote_attr_values or not v
                    or any(char in v for char in spaceCharacters + ">\"'="))

                v = v.replace("&", "&amp;")
                if self.escape_lt_in_attrs:
                    v = v.replace("<", "&lt;")
                if encoding:
                    v = encode(v, encoding)

                if not quote_attr:
                    attributes.append(v)
                    continue

                quote_char = self.quote_char
                if self.use_best_quote_char:
                    # Prefer the quote character that needs no escaping.
                    if "'" in v and '"' not in v:
                        quote_char = '"'
                    elif '"' in v and "'" not in v:
                        quote_char = "'"
                if quote_char == "'":
                    v = v.replace("'", "&#39;")
                else:
                    v = v.replace('"', "&quot;")
                attributes.append(quote_char)
                attributes.append(v)
                attributes.append(quote_char)

            if name in voidElements and self.use_trailing_solidus:
                if self.space_before_trailing_solidus:
                    attributes.append(" /")
                else:
                    attributes.append("/")

            if encoding:
                yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
            else:
                yield "<%s%s>" % (name, "".join(attributes))

        elif token_type == "EndTag":
            name = token["name"]
            if name in rcdataElements:
                in_cdata = False
            elif in_cdata:
                self.serializeError(_("Unexpected child element of a CDATA element"))
            end_tag = "</%s>" % name
            if encoding:
                end_tag = end_tag.encode(encoding, "strict")
            yield end_tag

        elif token_type == "Comment":
            data = token["data"]
            if "--" in data:
                self.serializeError(_("Comment contains --"))
            comment = "<!--%s-->" % data
            if encoding:
                comment = comment.encode(encoding, unicode_encode_errors)
            yield comment

        else:
            self.serializeError(token["data"])
def serialize(self, treewalker, encoding=None):
    """Serialize a stream of tree-walker tokens, yielding markup fragments.

    This is a generator; ``self.errors`` is reset to an empty list when
    iteration starts.  Depending on the ``inject_meta_charset``,
    ``strip_whitespace``, ``sanitize`` and ``omit_optional_tags`` options on
    ``self``, the token stream is first wrapped in the matching html5lib
    filters.

    :arg treewalker: iterable of token dicts.  Every token has a ``"type"``
        key; the other keys read are ``"name"``, ``"data"``, ``"publicId"``
        and ``"systemId"``, depending on the token type.
    :arg encoding: when truthy, each fragment is encoded with this codec
        before being yielded; otherwise fragments are yielded unencoded.

    Problems found while serializing are reported through
    ``self.serializeError``; a token of unrecognised type has its ``"data"``
    passed to ``self.serializeError`` and produces no output.
    """
    in_cdata = False
    self.errors = []

    # Filters are imported lazily so that only the enabled ones are loaded.
    if encoding and self.inject_meta_charset:
        from html5lib.filters.inject_meta_charset import Filter
        treewalker = Filter(treewalker, encoding)
    # XXX: WhitespaceFilter should be used before OptionalTagFilter
    # for maximum efficiency of this latter filter
    if self.strip_whitespace:
        from html5lib.filters.whitespace import Filter
        treewalker = Filter(treewalker)
    if self.sanitize:
        from html5lib.filters.sanitizer import Filter
        treewalker = Filter(treewalker)
    if self.omit_optional_tags:
        from html5lib.filters.optionaltags import Filter
        treewalker = Filter(treewalker)

    for token in treewalker:
        token_type = token["type"]

        if token_type == "Doctype":
            doctype = u"<!DOCTYPE %s" % token["name"]
            system_id = token["systemId"]

            if token["publicId"]:
                doctype += u' PUBLIC "%s"' % token["publicId"]
            elif system_id:
                doctype += u" SYSTEM"
            if system_id:
                # Use whichever quote character the identifier does not
                # contain; containing both cannot be represented.
                if u'"' in system_id:
                    if u"'" in system_id:
                        self.serializeError(_("System identifer contains both single and double quote characters"))
                    quote_char = u"'"
                else:
                    quote_char = u'"'
                doctype += u" %s%s%s" % (quote_char, system_id, quote_char)
            doctype += u">"

            if encoding:
                yield doctype.encode(encoding)
            else:
                yield doctype

        elif token_type in ("Characters", "SpaceCharacters"):
            if token_type == "SpaceCharacters" or in_cdata:
                # Whitespace and raw-text content are emitted unescaped.
                if in_cdata and "</" in token["data"]:
                    self.serializeError(_("Unexpected </ in CDATA"))
                if encoding:
                    yield token["data"].encode(encoding, "strict")
                else:
                    yield token["data"]
            elif encoding:
                yield encode(escape(token["data"]), encoding)
            else:
                yield escape(token["data"])

        elif token_type in ("StartTag", "EmptyTag"):
            name = token["name"]
            if name in rcdataElements and not self.escape_rcdata:
                in_cdata = True
            elif in_cdata:
                self.serializeError(_("Unexpected child element of a CDATA element"))

            attrs = token["data"]
            # sorted() builds a new list, so the token's own attribute
            # container is never reordered in place (and dict views work).
            attrs = sorted(attrs.items() if hasattr(attrs, "items") else attrs)

            attributes = []
            for k, v in attrs:
                if encoding:
                    k = k.encode(encoding, "strict")
                attributes.append(' ')
                attributes.append(k)

                if self.minimize_boolean_attributes and (
                        k in booleanAttributes.get(name, tuple())
                        or k in booleanAttributes.get("", tuple())):
                    # Minimized boolean attribute: the name alone is emitted.
                    continue

                attributes.append("=")
                # Quotes are required for empty values and for values
                # containing whitespace, '>', quotes or '='.
                quote_attr = bool(
                    self.quote_attr_values or not v
                    or any(char in v for char in spaceCharacters + ">\"'="))

                v = v.replace("&", "&amp;")
                if self.escape_lt_in_attrs:
                    v = v.replace("<", "&lt;")
                if encoding:
                    v = encode(v, encoding)

                if not quote_attr:
                    attributes.append(v)
                    continue

                quote_char = self.quote_char
                if self.use_best_quote_char:
                    # Prefer the quote character that needs no escaping.
                    if "'" in v and '"' not in v:
                        quote_char = '"'
                    elif '"' in v and "'" not in v:
                        quote_char = "'"
                if quote_char == "'":
                    v = v.replace("'", "&#39;")
                else:
                    v = v.replace('"', "&quot;")
                attributes.append(quote_char)
                attributes.append(v)
                attributes.append(quote_char)

            if name in voidElements and self.use_trailing_solidus:
                if self.space_before_trailing_solidus:
                    attributes.append(" /")
                else:
                    attributes.append("/")

            if encoding:
                yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
            else:
                yield u"<%s%s>" % (name, u"".join(attributes))

        elif token_type == "EndTag":
            name = token["name"]
            if name in rcdataElements:
                in_cdata = False
            elif in_cdata:
                self.serializeError(_("Unexpected child element of a CDATA element"))
            end_tag = u"</%s>" % name
            if encoding:
                end_tag = end_tag.encode(encoding, "strict")
            yield end_tag

        elif token_type == "Comment":
            data = token["data"]
            if "--" in data:
                self.serializeError(_("Comment contains --"))
            comment = u"<!--%s-->" % data
            if encoding:
                comment = comment.encode(encoding, unicode_encode_errors)
            yield comment

        elif token_type == "Entity":
            name = token["name"]
            key = name + ";"
            known = key in entities
            if not known:
                self.serializeError(_("Entity %s not recognized" % name))
            # Only resolve entities that exist in the table: if
            # serializeError returns instead of raising, an unknown key
            # must fall back to the literal reference, not raise KeyError.
            if known and self.resolve_entities and key not in xmlEntities:
                data = entities[key]
            else:
                data = u"&%s;" % name
            if encoding:
                data = data.encode(encoding, unicode_encode_errors)
            yield data

        else:
            self.serializeError(token["data"])
def serialize(self, treewalker, encoding=None):
    """Serialize a stream of tree-walker tokens, yielding markup fragments.

    This is a generator; ``self.errors`` is reset to an empty list when
    iteration starts.  Depending on the ``inject_meta_charset``,
    ``strip_whitespace``, ``sanitize`` and ``omit_optional_tags`` options on
    ``self``, the token stream is first wrapped in the matching html5lib
    filters.

    :arg treewalker: iterable of token dicts.  Every token has a ``"type"``
        key; the other keys read are ``"name"``, ``"data"``, ``"publicId"``
        and ``"systemId"``, depending on the token type.
    :arg encoding: when truthy, each fragment is encoded with this codec
        before being yielded; otherwise fragments are yielded unencoded.

    Problems found while serializing are reported through
    ``self.serializeError``; a token of unrecognised type has its ``"data"``
    passed to ``self.serializeError`` and produces no output.
    """
    in_cdata = False
    self.errors = []

    # Filters are imported lazily so that only the enabled ones are loaded.
    if encoding and self.inject_meta_charset:
        from html5lib.filters.inject_meta_charset import Filter
        treewalker = Filter(treewalker, encoding)
    # XXX: WhitespaceFilter should be used before OptionalTagFilter
    # for maximum efficiency of this latter filter
    if self.strip_whitespace:
        from html5lib.filters.whitespace import Filter
        treewalker = Filter(treewalker)
    if self.sanitize:
        from html5lib.filters.sanitizer import Filter
        treewalker = Filter(treewalker)
    if self.omit_optional_tags:
        from html5lib.filters.optionaltags import Filter
        treewalker = Filter(treewalker)

    for token in treewalker:
        token_type = token["type"]

        if token_type == "Doctype":
            doctype = u"<!DOCTYPE %s" % token["name"]
            system_id = token["systemId"]

            if token["publicId"]:
                doctype += u' PUBLIC "%s"' % token["publicId"]
            elif system_id:
                doctype += u" SYSTEM"
            if system_id:
                # Use whichever quote character the identifier does not
                # contain; containing both cannot be represented.
                if u'"' in system_id:
                    if u"'" in system_id:
                        self.serializeError(
                            _("System identifer contains both single and double quote characters"))
                    quote_char = u"'"
                else:
                    quote_char = u'"'
                doctype += u" %s%s%s" % (quote_char, system_id, quote_char)
            doctype += u">"

            if encoding:
                yield doctype.encode(encoding)
            else:
                yield doctype

        elif token_type in ("Characters", "SpaceCharacters"):
            if token_type == "SpaceCharacters" or in_cdata:
                # Whitespace and raw-text content are emitted unescaped.
                if in_cdata and "</" in token["data"]:
                    self.serializeError(_("Unexpected </ in CDATA"))
                if encoding:
                    yield token["data"].encode(encoding, "strict")
                else:
                    yield token["data"]
            elif encoding:
                yield encode(escape(token["data"]), encoding)
            else:
                yield escape(token["data"])

        elif token_type in ("StartTag", "EmptyTag"):
            name = token["name"]
            if name in rcdataElements and not self.escape_rcdata:
                in_cdata = True
            elif in_cdata:
                self.serializeError(
                    _("Unexpected child element of a CDATA element"))

            attrs = token["data"]
            # sorted() builds a new list, so the token's own attribute
            # container is never reordered in place (and dict views work).
            attrs = sorted(attrs.items() if hasattr(attrs, "items") else attrs)

            attributes = []
            for k, v in attrs:
                if encoding:
                    k = k.encode(encoding, "strict")
                attributes.append(' ')
                attributes.append(k)

                if self.minimize_boolean_attributes and (
                        k in booleanAttributes.get(name, tuple())
                        or k in booleanAttributes.get("", tuple())):
                    # Minimized boolean attribute: the name alone is emitted.
                    continue

                attributes.append("=")
                # Quotes are required for empty values and for values
                # containing whitespace, '>', quotes or '='.
                quote_attr = bool(
                    self.quote_attr_values or not v
                    or any(char in v for char in spaceCharacters + ">\"'="))

                v = v.replace("&", "&amp;")
                if self.escape_lt_in_attrs:
                    v = v.replace("<", "&lt;")
                if encoding:
                    v = encode(v, encoding)

                if not quote_attr:
                    attributes.append(v)
                    continue

                quote_char = self.quote_char
                if self.use_best_quote_char:
                    # Prefer the quote character that needs no escaping.
                    if "'" in v and '"' not in v:
                        quote_char = '"'
                    elif '"' in v and "'" not in v:
                        quote_char = "'"
                if quote_char == "'":
                    v = v.replace("'", "&#39;")
                else:
                    v = v.replace('"', "&quot;")
                attributes.append(quote_char)
                attributes.append(v)
                attributes.append(quote_char)

            if name in voidElements and self.use_trailing_solidus:
                if self.space_before_trailing_solidus:
                    attributes.append(" /")
                else:
                    attributes.append("/")

            if encoding:
                yield "<%s%s>" % (name.encode(encoding, "strict"),
                                  "".join(attributes))
            else:
                yield u"<%s%s>" % (name, u"".join(attributes))

        elif token_type == "EndTag":
            name = token["name"]
            if name in rcdataElements:
                in_cdata = False
            elif in_cdata:
                self.serializeError(
                    _("Unexpected child element of a CDATA element"))
            end_tag = u"</%s>" % name
            if encoding:
                end_tag = end_tag.encode(encoding, "strict")
            yield end_tag

        elif token_type == "Comment":
            data = token["data"]
            if "--" in data:
                self.serializeError(_("Comment contains --"))
            comment = u"<!--%s-->" % data
            if encoding:
                comment = comment.encode(encoding, unicode_encode_errors)
            yield comment

        elif token_type == "Entity":
            name = token["name"]
            key = name + ";"
            known = key in entities
            if not known:
                self.serializeError(_("Entity %s not recognized" % name))
            # Only resolve entities that exist in the table: if
            # serializeError returns instead of raising, an unknown key
            # must fall back to the literal reference, not raise KeyError.
            if known and self.resolve_entities and key not in xmlEntities:
                data = entities[key]
            else:
                data = u"&%s;" % name
            if encoding:
                data = data.encode(encoding, unicode_encode_errors)
            yield data

        else:
            self.serializeError(token["data"])
def serialize(self, treewalker, encoding=None):
    """Serialize a stream of tree-walker tokens, yielding markup fragments.

    This is a generator.  When iteration starts it stores ``encoding`` on
    ``self.encoding`` and resets ``self.errors`` to an empty list.
    Depending on the ``inject_meta_charset``, ``strip_whitespace``,
    ``sanitize`` and ``omit_optional_tags`` options on ``self``, the token
    stream is first wrapped in the matching html5lib filters.

    Every fragment is passed through ``self.encode`` or
    ``self.encodeStrict`` before being yielded (both are defined outside
    this block; presumably they apply ``self.encoding``).  A start tag is
    streamed as several small fragments rather than one string.

    :arg treewalker: iterable of token dicts.  Every token has a ``u"type"``
        key; the other keys read are ``u"name"``, ``u"data"``,
        ``u"publicId"`` and ``u"systemId"``, depending on the token type.
        For start/empty tags ``u"data"`` is a mapping keyed by
        ``(namespace, name)`` pairs.
    :arg encoding: stored on ``self.encoding``; also passed to the
        meta-charset filter when that option is enabled.

    Problems found while serializing are reported through
    ``self.serializeError``; a token of unrecognised type has its
    ``u"data"`` passed to ``self.serializeError`` and produces no output.
    """
    self.encoding = encoding
    in_cdata = False
    self.errors = []

    # Filters are imported lazily so that only the enabled ones are loaded.
    if encoding and self.inject_meta_charset:
        from html5lib.filters.inject_meta_charset import Filter
        treewalker = Filter(treewalker, encoding)
    # XXX: WhitespaceFilter should be used before OptionalTagFilter
    # for maximum efficiency of this latter filter
    if self.strip_whitespace:
        from html5lib.filters.whitespace import Filter
        treewalker = Filter(treewalker)
    if self.sanitize:
        from html5lib.filters.sanitizer import Filter
        treewalker = Filter(treewalker)
    if self.omit_optional_tags:
        from html5lib.filters.optionaltags import Filter
        treewalker = Filter(treewalker)

    for token in treewalker:
        token_type = token[u"type"]

        if token_type == u"Doctype":
            doctype = u"<!DOCTYPE %s" % token[u"name"]
            system_id = token[u"systemId"]

            if token[u"publicId"]:
                doctype += u' PUBLIC "%s"' % token[u"publicId"]
            elif system_id:
                doctype += u" SYSTEM"
            if system_id:
                # Use whichever quote character the identifier does not
                # contain; containing both cannot be represented.
                if u'"' in system_id:
                    if u"'" in system_id:
                        self.serializeError(
                            _(u"System identifer contains both single and double quote characters"))
                    quote_char = u"'"
                else:
                    quote_char = u'"'
                doctype += u" %s%s%s" % (quote_char, system_id, quote_char)
            doctype += u">"
            yield self.encodeStrict(doctype)

        elif token_type in (u"Characters", u"SpaceCharacters"):
            if token_type == u"SpaceCharacters" or in_cdata:
                # Whitespace and raw-text content are emitted unescaped.
                if in_cdata and u"</" in token[u"data"]:
                    self.serializeError(_(u"Unexpected </ in CDATA"))
                yield self.encode(token[u"data"])
            else:
                yield self.encode(escape(token[u"data"]))

        elif token_type in (u"StartTag", u"EmptyTag"):
            name = token[u"name"]
            yield self.encodeStrict(u"<%s" % name)
            if name in rcdataElements and not self.escape_rcdata:
                in_cdata = True
            elif in_cdata:
                self.serializeError(
                    _(u"Unexpected child element of a CDATA element"))

            for (attr_namespace, k), v in sorted(token[u"data"].items()):
                # TODO: Add namespace support here
                yield self.encodeStrict(u' ')
                yield self.encodeStrict(k)

                if self.minimize_boolean_attributes and (
                        k in booleanAttributes.get(name, tuple())
                        or k in booleanAttributes.get(u"", tuple())):
                    # Minimized boolean attribute: the name alone is emitted.
                    continue

                yield self.encodeStrict(u"=")
                # Quotes are required for empty values and for values
                # containing whitespace, '>', quotes or '='.
                quote_attr = bool(
                    self.quote_attr_values or not v
                    or any(char in v for char in spaceCharacters + u">\"'="))

                v = v.replace(u"&", u"&amp;")
                if self.escape_lt_in_attrs:
                    v = v.replace(u"<", u"&lt;")

                if not quote_attr:
                    yield self.encode(v)
                    continue

                quote_char = self.quote_char
                if self.use_best_quote_char:
                    # Prefer the quote character that needs no escaping.
                    if u"'" in v and u'"' not in v:
                        quote_char = u'"'
                    elif u'"' in v and u"'" not in v:
                        quote_char = u"'"
                if quote_char == u"'":
                    v = v.replace(u"'", u"&#39;")
                else:
                    v = v.replace(u'"', u"&quot;")
                yield self.encodeStrict(quote_char)
                yield self.encode(v)
                yield self.encodeStrict(quote_char)

            if name in voidElements and self.use_trailing_solidus:
                if self.space_before_trailing_solidus:
                    yield self.encodeStrict(u" /")
                else:
                    yield self.encodeStrict(u"/")
            yield self.encode(u">")

        elif token_type == u"EndTag":
            name = token[u"name"]
            if name in rcdataElements:
                in_cdata = False
            elif in_cdata:
                self.serializeError(
                    _(u"Unexpected child element of a CDATA element"))
            yield self.encodeStrict(u"</%s>" % name)

        elif token_type == u"Comment":
            data = token[u"data"]
            if u"--" in data:
                self.serializeError(_(u"Comment contains --"))
            yield self.encodeStrict(u"<!--%s-->" % data)

        elif token_type == u"Entity":
            name = token[u"name"]
            key = name + u";"
            known = key in entities
            if not known:
                self.serializeError(_(u"Entity %s not recognized" % name))
            # Only resolve entities that exist in the table: if
            # serializeError returns instead of raising, an unknown key
            # must fall back to the literal reference, not raise KeyError.
            if known and self.resolve_entities and key not in xmlEntities:
                data = entities[key]
            else:
                data = u"&%s;" % name
            yield self.encodeStrict(data)

        else:
            self.serializeError(token[u"data"])
def serialize(self, treewalker, encoding=None):
    """Serialize a stream of tree-walker tokens, yielding markup fragments.

    This is a generator.  When iteration starts it stores ``encoding`` on
    ``self.encoding`` and resets ``self.errors`` to an empty list.
    Depending on the ``inject_meta_charset``, ``strip_whitespace``,
    ``sanitize`` and ``omit_optional_tags`` options on ``self``, the token
    stream is first wrapped in the matching html5lib filters.

    Every fragment is passed through ``self.encode`` or
    ``self.encodeStrict`` before being yielded (both are defined outside
    this block; presumably they apply ``self.encoding``).  A start tag is
    streamed as several small fragments rather than one string.

    :arg treewalker: iterable of token dicts.  Every token has a ``u"type"``
        key; the other keys read are ``u"name"``, ``u"data"``,
        ``u"publicId"`` and ``u"systemId"``, depending on the token type.
        For start/empty tags ``u"data"`` is a mapping keyed by
        ``(namespace, name)`` pairs.
    :arg encoding: stored on ``self.encoding``; also passed to the
        meta-charset filter when that option is enabled.

    Problems found while serializing are reported through
    ``self.serializeError``; a token of unrecognised type has its
    ``u"data"`` passed to ``self.serializeError`` and produces no output.
    """
    self.encoding = encoding
    in_cdata = False
    self.errors = []

    # Filters are imported lazily so that only the enabled ones are loaded.
    if encoding and self.inject_meta_charset:
        from html5lib.filters.inject_meta_charset import Filter
        treewalker = Filter(treewalker, encoding)
    # XXX: WhitespaceFilter should be used before OptionalTagFilter
    # for maximum efficiency of this latter filter
    if self.strip_whitespace:
        from html5lib.filters.whitespace import Filter
        treewalker = Filter(treewalker)
    if self.sanitize:
        from html5lib.filters.sanitizer import Filter
        treewalker = Filter(treewalker)
    if self.omit_optional_tags:
        from html5lib.filters.optionaltags import Filter
        treewalker = Filter(treewalker)

    for token in treewalker:
        token_type = token[u"type"]

        if token_type == u"Doctype":
            doctype = u"<!DOCTYPE %s" % token[u"name"]
            system_id = token[u"systemId"]

            if token[u"publicId"]:
                doctype += u' PUBLIC "%s"' % token[u"publicId"]
            elif system_id:
                doctype += u" SYSTEM"
            if system_id:
                # Use whichever quote character the identifier does not
                # contain; containing both cannot be represented.
                if u'"' in system_id:
                    if u"'" in system_id:
                        self.serializeError(_(u"System identifer contains both single and double quote characters"))
                    quote_char = u"'"
                else:
                    quote_char = u'"'
                doctype += u" %s%s%s" % (quote_char, system_id, quote_char)
            doctype += u">"
            yield self.encodeStrict(doctype)

        elif token_type in (u"Characters", u"SpaceCharacters"):
            if token_type == u"SpaceCharacters" or in_cdata:
                # Whitespace and raw-text content are emitted unescaped.
                if in_cdata and u"</" in token[u"data"]:
                    self.serializeError(_(u"Unexpected </ in CDATA"))
                yield self.encode(token[u"data"])
            else:
                yield self.encode(escape(token[u"data"]))

        elif token_type in (u"StartTag", u"EmptyTag"):
            name = token[u"name"]
            yield self.encodeStrict(u"<%s" % name)
            if name in rcdataElements and not self.escape_rcdata:
                in_cdata = True
            elif in_cdata:
                self.serializeError(_(u"Unexpected child element of a CDATA element"))

            for (attr_namespace, k), v in sorted(token[u"data"].items()):
                # TODO: Add namespace support here
                yield self.encodeStrict(u' ')
                yield self.encodeStrict(k)

                if self.minimize_boolean_attributes and (
                        k in booleanAttributes.get(name, tuple())
                        or k in booleanAttributes.get(u"", tuple())):
                    # Minimized boolean attribute: the name alone is emitted.
                    continue

                yield self.encodeStrict(u"=")
                # Quotes are required for empty values and for values
                # containing whitespace, '>', quotes or '='.
                quote_attr = bool(
                    self.quote_attr_values or not v
                    or any(char in v for char in spaceCharacters + u">\"'="))

                v = v.replace(u"&", u"&amp;")
                if self.escape_lt_in_attrs:
                    v = v.replace(u"<", u"&lt;")

                if not quote_attr:
                    yield self.encode(v)
                    continue

                quote_char = self.quote_char
                if self.use_best_quote_char:
                    # Prefer the quote character that needs no escaping.
                    if u"'" in v and u'"' not in v:
                        quote_char = u'"'
                    elif u'"' in v and u"'" not in v:
                        quote_char = u"'"
                if quote_char == u"'":
                    v = v.replace(u"'", u"&#39;")
                else:
                    v = v.replace(u'"', u"&quot;")
                yield self.encodeStrict(quote_char)
                yield self.encode(v)
                yield self.encodeStrict(quote_char)

            if name in voidElements and self.use_trailing_solidus:
                if self.space_before_trailing_solidus:
                    yield self.encodeStrict(u" /")
                else:
                    yield self.encodeStrict(u"/")
            yield self.encode(u">")

        elif token_type == u"EndTag":
            name = token[u"name"]
            if name in rcdataElements:
                in_cdata = False
            elif in_cdata:
                self.serializeError(_(u"Unexpected child element of a CDATA element"))
            yield self.encodeStrict(u"</%s>" % name)

        elif token_type == u"Comment":
            data = token[u"data"]
            if u"--" in data:
                self.serializeError(_(u"Comment contains --"))
            yield self.encodeStrict(u"<!--%s-->" % data)

        elif token_type == u"Entity":
            name = token[u"name"]
            key = name + u";"
            known = key in entities
            if not known:
                self.serializeError(_(u"Entity %s not recognized" % name))
            # Only resolve entities that exist in the table: if
            # serializeError returns instead of raising, an unknown key
            # must fall back to the literal reference, not raise KeyError.
            if known and self.resolve_entities and key not in xmlEntities:
                data = entities[key]
            else:
                data = u"&%s;" % name
            yield self.encodeStrict(data)

        else:
            self.serializeError(token[u"data"])