Esempio n. 1
0
    def serialize(self, treewalker, encoding=None):
        """Yield one serialized markup fragment per token from ``treewalker``.

        This is a generator, so nothing below runs until iteration starts.
        ``self.errors`` is reset to an empty list on the first step.

        Before the token loop the walker is wrapped, in this order, by the
        filters enabled on the instance: inject_meta_charset (only when an
        ``encoding`` is given), whitespace, sanitizer, optionaltags.

        :param treewalker: iterable of token dicts; each has a ``"type"`` key
            and, depending on the type, ``"name"`` and/or ``"data"``.
        :param encoding: when truthy, fragments are encoded with this codec;
            otherwise they are yielded as text.

        Problems found on the way (``</`` inside CDATA content, an element
        nested in a CDATA element, ``--`` inside a comment, an unknown token
        type) are passed to ``self.serializeError``.
        """
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s>" % token["name"]
                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and CDATA content are emitted unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    if encoding:
                        yield token["data"].encode(encoding, "strict")
                    else:
                        yield token["data"]
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                attrs = token["data"]
                if hasattr(attrs, "items"):
                    attrs = attrs.items()
                attributes = []
                # sorted() gives the same ordering as the former in-place
                # list.sort() but leaves the token's own attribute list alone.
                for k, v in sorted(attrs):
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')

                    attributes.append(k)
                    if (not self.minimize_boolean_attributes or
                            (k not in booleanAttributes.get(name, tuple())
                             and k not in booleanAttributes.get("", tuple()))):
                        attributes.append("=")
                        # Quotes are required for an empty value or one that
                        # contains whitespace, '>', a quote character or '='.
                        # This is decided on the raw value, before escaping.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = any(
                                char in v
                                for char in spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if encoding:
                            v = encode(v, encoding)
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote character the value does
                                # not contain, so nothing needs escaping.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            attributes.append(quote_char)
                            attributes.append(v)
                            attributes.append(quote_char)
                        else:
                            attributes.append(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        attributes.append(" /")
                    else:
                        attributes.append("/")
                if encoding:
                    # NOTE(review): joining encoded names/values with plain
                    # string literals relies on Python 2 str semantics --
                    # confirm before running this path on Python 3.
                    yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
                else:
                    yield "<%s%s>" % (name, "".join(attributes))

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                end_tag = "</%s>" % name
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                comment = "<!--%s-->" % data
                if encoding:
                    comment = comment.encode(encoding, unicode_encode_errors)
                yield comment

            else:
                # Unknown token type: its payload is reported as the error.
                self.serializeError(token["data"])
    def serialize(self, treewalker, encoding=None):
        """Yield one serialized markup fragment per token from ``treewalker``.

        This is a generator, so nothing below runs until iteration starts.
        ``self.errors`` is reset to an empty list on the first step.

        Before the token loop the walker is wrapped, in this order, by the
        filters enabled on the instance: inject_meta_charset (only when an
        ``encoding`` is given), whitespace, sanitizer, optionaltags.

        :param treewalker: iterable of token dicts; each has a ``"type"`` key
            and, depending on the type, ``"name"``, ``"data"``,
            ``"publicId"`` and ``"systemId"``.
        :param encoding: when truthy, fragments are encoded with this codec;
            otherwise they are yielded as text.

        Problems found on the way (a system identifier containing both quote
        characters, ``</`` inside CDATA content, an element nested in a CDATA
        element, ``--`` inside a comment, an unrecognized entity, an unknown
        token type) are passed to ``self.serializeError``.
        """
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = u"<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += u' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += u" SYSTEM"
                if token["systemId"]:
                    # Use double quotes unless the identifier itself contains
                    # one; both kinds present cannot be represented.
                    if token["systemId"].find(u'"') >= 0:
                        if token["systemId"].find(u"'") >= 0:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += u">"

                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and CDATA content are emitted unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    if encoding:
                        yield token["data"].encode(encoding, "strict")
                    else:
                        yield token["data"]
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                attrs = token["data"]
                if hasattr(attrs, "items"):
                    attrs = attrs.items()
                attributes = []
                # sorted() gives the same ordering as the former in-place
                # list.sort(), works when items() is not a list, and leaves
                # the token's own attribute list alone.
                for k, v in sorted(attrs):
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')

                    attributes.append(k)
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple())
                       and k not in booleanAttributes.get("", tuple())):
                        attributes.append("=")
                        # Quotes are required for an empty value or one that
                        # contains whitespace, '>', a quote character or '='.
                        # This is decided on the raw value, before escaping.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = any(
                                char in v
                                for char in spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if encoding:
                            v = encode(v, encoding)
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote character the value does
                                # not contain, so nothing needs escaping.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            attributes.append(quote_char)
                            attributes.append(v)
                            attributes.append(quote_char)
                        else:
                            attributes.append(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        attributes.append(" /")
                    else:
                        attributes.append("/")
                if encoding:
                    # NOTE(review): joining encoded names/values with plain
                    # string literals relies on Python 2 str semantics --
                    # confirm before running this path on Python 3.
                    yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
                else:
                    yield u"<%s%s>" % (name, u"".join(attributes))

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                end_tag = u"</%s>" % name
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                comment = u"<!--%s-->" % data
                if encoding:
                    comment = comment.encode(encoding, unicode_encode_errors)
                yield comment

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    # Interpolate after the _() call so the message id stays
                    # constant (presumably _ is a gettext-style lookup).
                    self.serializeError(_("Entity %s not recognized") % name)
                # Only resolve entities that are actually in the table; an
                # unknown one is emitted as a literal reference instead of
                # raising KeyError when serializeError returns.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                if encoding:
                    data = data.encode(encoding, unicode_encode_errors)
                yield data

            else:
                # Unknown token type: its payload is reported as the error.
                self.serializeError(token["data"])
Esempio n. 3
0
    def serialize(self, treewalker, encoding=None):
        """Yield one serialized markup fragment per token from ``treewalker``.

        This is a generator, so nothing below runs until iteration starts.
        ``self.errors`` is reset to an empty list on the first step.

        Before the token loop the walker is wrapped, in this order, by the
        filters enabled on the instance: inject_meta_charset (only when an
        ``encoding`` is given), whitespace, sanitizer, optionaltags.

        :param treewalker: iterable of token dicts; each has a ``"type"`` key
            and, depending on the type, ``"name"``, ``"data"``,
            ``"publicId"`` and ``"systemId"``.
        :param encoding: when truthy, fragments are encoded with this codec;
            otherwise they are yielded as text.

        Problems found on the way (a system identifier containing both quote
        characters, ``</`` inside CDATA content, an element nested in a CDATA
        element, ``--`` inside a comment, an unrecognized entity, an unknown
        token type) are passed to ``self.serializeError``.
        """
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = u"<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += u' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += u" SYSTEM"
                if token["systemId"]:
                    # Use double quotes unless the identifier itself contains
                    # one; both kinds present cannot be represented.
                    if token["systemId"].find(u'"') >= 0:
                        if token["systemId"].find(u"'") >= 0:
                            self.serializeError(
                                _("System identifer contains both single and double quote characters"
                                  ))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token["systemId"],
                                             quote_char)

                doctype += u">"

                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and CDATA content are emitted unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    if encoding:
                        yield token["data"].encode(encoding, "strict")
                    else:
                        yield token["data"]
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(
                        _("Unexpected child element of a CDATA element"))
                attrs = token["data"]
                if hasattr(attrs, "items"):
                    attrs = attrs.items()
                attributes = []
                # sorted() gives the same ordering as the former in-place
                # list.sort(), works when items() is not a list, and leaves
                # the token's own attribute list alone.
                for k, v in sorted(attrs):
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')

                    attributes.append(k)
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple())
                       and k not in booleanAttributes.get("", tuple())):
                        attributes.append("=")
                        # Quotes are required for an empty value or one that
                        # contains whitespace, '>', a quote character or '='.
                        # This is decided on the raw value, before escaping.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = any(
                                char in v
                                for char in spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if encoding:
                            v = encode(v, encoding)
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote character the value does
                                # not contain, so nothing needs escaping.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            attributes.append(quote_char)
                            attributes.append(v)
                            attributes.append(quote_char)
                        else:
                            attributes.append(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        attributes.append(" /")
                    else:
                        attributes.append("/")
                if encoding:
                    # NOTE(review): joining encoded names/values with plain
                    # string literals relies on Python 2 str semantics --
                    # confirm before running this path on Python 3.
                    yield "<%s%s>" % (name.encode(
                        encoding, "strict"), "".join(attributes))
                else:
                    yield u"<%s%s>" % (name, u"".join(attributes))

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(
                        _("Unexpected child element of a CDATA element"))
                end_tag = u"</%s>" % name
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                comment = u"<!--%s-->" % data
                if encoding:
                    comment = comment.encode(encoding, unicode_encode_errors)
                yield comment

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    # Interpolate after the _() call so the message id stays
                    # constant (presumably _ is a gettext-style lookup).
                    self.serializeError(_("Entity %s not recognized") % name)
                # Only resolve entities that are actually in the table; an
                # unknown one is emitted as a literal reference instead of
                # raising KeyError when serializeError returns.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                if encoding:
                    data = data.encode(encoding, unicode_encode_errors)
                yield data

            else:
                # Unknown token type: its payload is reported as the error.
                self.serializeError(token["data"])
Esempio n. 4
0
    def serialize(self, treewalker, encoding=None):
        """Yield serialized markup fragments for the tokens of ``treewalker``.

        This is a generator, so nothing below runs until iteration starts.
        On the first step ``self.encoding`` is set to ``encoding`` and
        ``self.errors`` is reset to an empty list.  Every fragment is passed
        through ``self.encodeStrict`` or ``self.encode`` (both defined
        outside this method) before being yielded.  A start tag is emitted as
        several fragments: the tag opener, each attribute piece, and ``>``.

        Before the token loop the walker is wrapped, in this order, by the
        filters enabled on the instance: inject_meta_charset (only when an
        ``encoding`` is given), whitespace, sanitizer, optionaltags.

        :param treewalker: iterable of token dicts; each has a ``"type"`` key
            and, depending on the type, ``"name"``, ``"data"``,
            ``"publicId"`` and ``"systemId"``.  For start/empty tags
            ``"data"`` is a mapping from ``(namespace, name)`` to the
            attribute value.
        :param encoding: stored on ``self.encoding`` and handed to the
            inject_meta_charset filter when that filter is enabled.

        Problems found on the way (a system identifier containing both quote
        characters, ``</`` inside CDATA content, an element nested in a CDATA
        element, ``--`` inside a comment, an unrecognized entity, an unknown
        token type) are passed to ``self.serializeError``.
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token[u"type"]
            if token_type == u"Doctype":
                doctype = u"<!DOCTYPE %s" % token[u"name"]

                if token[u"publicId"]:
                    doctype += u' PUBLIC "%s"' % token[u"publicId"]
                elif token[u"systemId"]:
                    doctype += u" SYSTEM"
                if token[u"systemId"]:
                    # Use double quotes unless the identifier itself contains
                    # one; both kinds present cannot be represented.
                    if token[u"systemId"].find(u'"') >= 0:
                        if token[u"systemId"].find(u"'") >= 0:
                            self.serializeError(
                                _(u"System identifer contains both single and double quote characters"
                                  ))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token[u"systemId"],
                                             quote_char)

                doctype += u">"
                yield self.encodeStrict(doctype)

            elif token_type in (u"Characters", u"SpaceCharacters"):
                if token_type == u"SpaceCharacters" or in_cdata:
                    # Whitespace and CDATA content are emitted unescaped.
                    if in_cdata and token[u"data"].find(u"</") >= 0:
                        self.serializeError(_(u"Unexpected </ in CDATA"))
                    yield self.encode(token[u"data"])
                else:
                    yield self.encode(escape(token[u"data"]))

            elif token_type in (u"StartTag", u"EmptyTag"):
                name = token[u"name"]
                yield self.encodeStrict(u"<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(
                        _(u"Unexpected child element of a CDATA element"))
                for (attr_namespace,
                     attr_name), attr_value in sorted(token[u"data"].items()):
                    #TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(u' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple())
                       and k not in booleanAttributes.get(u"", tuple())):
                        yield self.encodeStrict(u"=")
                        # Quotes are required for an empty value or one that
                        # contains whitespace, '>', a quote character or '='.
                        # This is decided on the raw value, before escaping.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = any(
                                char in v
                                for char in spaceCharacters + u">\"'=")
                        v = v.replace(u"&", u"&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace(u"<", u"&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote character the value does
                                # not contain, so nothing needs escaping.
                                if u"'" in v and u'"' not in v:
                                    quote_char = u'"'
                                elif u'"' in v and u"'" not in v:
                                    quote_char = u"'"
                            if quote_char == u"'":
                                v = v.replace(u"'", u"&#39;")
                            else:
                                v = v.replace(u'"', u"&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(u" /")
                    else:
                        yield self.encodeStrict(u"/")
                yield self.encode(u">")

            elif token_type == u"EndTag":
                name = token[u"name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(
                        _(u"Unexpected child element of a CDATA element"))
                yield self.encodeStrict(u"</%s>" % name)

            elif token_type == u"Comment":
                data = token[u"data"]
                if data.find(u"--") >= 0:
                    self.serializeError(_(u"Comment contains --"))
                yield self.encodeStrict(u"<!--%s-->" % data)

            elif token_type == u"Entity":
                name = token[u"name"]
                key = name + u";"
                known = key in entities
                if not known:
                    # Interpolate after the _() call so the message id stays
                    # constant (presumably _ is a gettext-style lookup).
                    self.serializeError(_(u"Entity %s not recognized") % name)
                # Only resolve entities that are actually in the table; an
                # unknown one is emitted as a literal reference instead of
                # raising KeyError when serializeError returns.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Unknown token type: its payload is reported as the error.
                self.serializeError(token[u"data"])
    def serialize(self, treewalker, encoding=None):
        """Yield serialized markup fragments for the tokens of ``treewalker``.

        This is a generator, so nothing below runs until iteration starts.
        On the first step ``self.encoding`` is set to ``encoding`` and
        ``self.errors`` is reset to an empty list.  Every fragment is passed
        through ``self.encodeStrict`` or ``self.encode`` (both defined
        outside this method) before being yielded.  A start tag is emitted as
        several fragments: the tag opener, each attribute piece, and ``>``.

        Before the token loop the walker is wrapped, in this order, by the
        filters enabled on the instance: inject_meta_charset (only when an
        ``encoding`` is given), whitespace, sanitizer, optionaltags.

        :param treewalker: iterable of token dicts; each has a ``"type"`` key
            and, depending on the type, ``"name"``, ``"data"``,
            ``"publicId"`` and ``"systemId"``.  For start/empty tags
            ``"data"`` is a mapping from ``(namespace, name)`` to the
            attribute value.
        :param encoding: stored on ``self.encoding`` and handed to the
            inject_meta_charset filter when that filter is enabled.

        Problems found on the way (a system identifier containing both quote
        characters, ``</`` inside CDATA content, an element nested in a CDATA
        element, ``--`` inside a comment, an unrecognized entity, an unknown
        token type) are passed to ``self.serializeError``.
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token[u"type"]
            if token_type == u"Doctype":
                doctype = u"<!DOCTYPE %s" % token[u"name"]

                if token[u"publicId"]:
                    doctype += u' PUBLIC "%s"' % token[u"publicId"]
                elif token[u"systemId"]:
                    doctype += u" SYSTEM"
                if token[u"systemId"]:
                    # Use double quotes unless the identifier itself contains
                    # one; both kinds present cannot be represented.
                    if token[u"systemId"].find(u'"') >= 0:
                        if token[u"systemId"].find(u"'") >= 0:
                            self.serializeError(_(u"System identifer contains both single and double quote characters"))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token[u"systemId"], quote_char)

                doctype += u">"
                yield self.encodeStrict(doctype)

            elif token_type in (u"Characters", u"SpaceCharacters"):
                if token_type == u"SpaceCharacters" or in_cdata:
                    # Whitespace and CDATA content are emitted unescaped.
                    if in_cdata and token[u"data"].find(u"</") >= 0:
                        self.serializeError(_(u"Unexpected </ in CDATA"))
                    yield self.encode(token[u"data"])
                else:
                    yield self.encode(escape(token[u"data"]))

            elif token_type in (u"StartTag", u"EmptyTag"):
                name = token[u"name"]
                yield self.encodeStrict(u"<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_(u"Unexpected child element of a CDATA element"))
                for (attr_namespace, attr_name), attr_value in sorted(token[u"data"].items()):
                    #TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(u' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple())
                       and k not in booleanAttributes.get(u"", tuple())):
                        yield self.encodeStrict(u"=")
                        # Quotes are required for an empty value or one that
                        # contains whitespace, '>', a quote character or '='.
                        # This is decided on the raw value, before escaping.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = any(
                                char in v
                                for char in spaceCharacters + u">\"'=")
                        v = v.replace(u"&", u"&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace(u"<", u"&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote character the value does
                                # not contain, so nothing needs escaping.
                                if u"'" in v and u'"' not in v:
                                    quote_char = u'"'
                                elif u'"' in v and u"'" not in v:
                                    quote_char = u"'"
                            if quote_char == u"'":
                                v = v.replace(u"'", u"&#39;")
                            else:
                                v = v.replace(u'"', u"&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(u" /")
                    else:
                        yield self.encodeStrict(u"/")
                yield self.encode(u">")

            elif token_type == u"EndTag":
                name = token[u"name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_(u"Unexpected child element of a CDATA element"))
                yield self.encodeStrict(u"</%s>" % name)

            elif token_type == u"Comment":
                data = token[u"data"]
                if data.find(u"--") >= 0:
                    self.serializeError(_(u"Comment contains --"))
                yield self.encodeStrict(u"<!--%s-->" % data)

            elif token_type == u"Entity":
                name = token[u"name"]
                key = name + u";"
                known = key in entities
                if not known:
                    # Interpolate after the _() call so the message id stays
                    # constant (presumably _ is a gettext-style lookup).
                    self.serializeError(_(u"Entity %s not recognized") % name)
                # Only resolve entities that are actually in the table; an
                # unknown one is emitted as a literal reference instead of
                # raising KeyError when serializeError returns.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Unknown token type: its payload is reported as the error.
                self.serializeError(token[u"data"])