Beispiel #1
0
def _escape_cdata(text, encoding=None):
    """Escape character data for inclusion in markup.

    When ``encoding`` is truthy, ``text`` is first encoded with it; if that
    raises ``UnicodeError`` the result of ``_encode_entity(text)`` is
    returned instead.  Otherwise every match of ``nonentity_re`` is replaced
    with ``&amp;`` and every ``<`` with ``&lt;``.

    A ``TypeError`` or ``AttributeError`` raised along the way is reported
    through ``_raise_serialization_error(text)``.
    """
    try:
        if encoding:
            try:
                text = text.encode(encoding)
            except UnicodeError:
                return _encode_entity(text)
        # The replacement has to be the entity "&amp;"; a bare "&" would put
        # back an unescaped ampersand.  (Presumably nonentity_re matches
        # ampersands that do not start an entity -- confirm at its definition.)
        text = nonentity_re.sub("&amp;", text)
        text = text.replace("<", "&lt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
Beispiel #2
0
def _escape_cdata(text, encoding=None):
    """Return ``text`` escaped for use as character data.

    With a truthy ``encoding`` the text is encoded first, and
    ``_encode_entity(text)`` is returned if encoding raises ``UnicodeError``.
    Matches of ``nonentity_re`` become ``&amp;`` and ``<`` becomes ``&lt;``.
    ``TypeError`` / ``AttributeError`` are routed to
    ``_raise_serialization_error``.
    """
    try:
        if encoding:
            try:
                encoded = text.encode(encoding)
            except UnicodeError:
                return _encode_entity(text)
            text = encoded
        return nonentity_re.sub('&amp;', text).replace("<", "&lt;")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
Beispiel #3
0
def _escape_attrib(text, encoding=None):
    """Return ``text`` escaped for use as a double-quoted attribute value.

    With a truthy ``encoding`` the text is encoded first, and
    ``_encode_entity(text)`` is returned if encoding raises ``UnicodeError``.
    Matches of ``nonentity_re`` become ``&amp;``, ``<`` becomes ``&lt;`` and
    ``"`` becomes ``&quot;``.  ``TypeError`` / ``AttributeError`` are routed
    to ``_raise_serialization_error``.
    """
    try:
        if encoding:
            try:
                encoded = text.encode(encoding)
            except UnicodeError:
                return _encode_entity(text)
            text = encoded
        # don't requote properly-quoted entities
        escaped = nonentity_re.sub("&amp;", text)
        for raw, entity in (("<", "&lt;"), ('"', "&quot;")):
            escaped = escaped.replace(raw, entity)
        return escaped
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
Beispiel #4
0
def _escape_attrib(text, encoding=None):
    """Escape an attribute value.

    The value is optionally encoded with ``encoding`` (falling back to
    ``_encode_entity(text)`` on ``UnicodeError``), then matches of
    ``nonentity_re`` are replaced with ``&amp;``, ``<`` with ``&lt;`` and
    ``"`` with ``&quot;``.  A ``TypeError`` or ``AttributeError`` is handed
    to ``_raise_serialization_error``.
    """
    try:
        if encoding:
            try:
                as_bytes = text.encode(encoding)
            except UnicodeError:
                return _encode_entity(text)
            text = as_bytes
        # don't requote properly-quoted entities
        without_bare_amps = nonentity_re.sub('&amp;', text)
        return without_bare_amps.replace("<", "&lt;").replace('"', "&quot;")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
Beispiel #5
0
def _write_html(write, node, encoding, namespaces, depth=-1, maxdepth=None):
    """Write ``node`` and its subtree as HTML through ``write``.

    Args:
        write: callable invoked with each output fragment.
        node: element to serialize; its ``tag``, ``text``, ``tail``,
            ``attrib`` and ``_children`` attributes are read.
        encoding: codec name used to encode output; ``None`` means "utf-8".
        namespaces: passed to ``fixtag`` for namespaced tags.
        depth: counter compared against ``maxdepth`` while recursing.
        maxdepth: when not ``None``, children are written only while the
            counter stays below it.
    """
    if encoding is None:
        encoding = "utf-8"

    tag = node.tag
    text = node.text
    tail = node.tail

    if tag is Replace:
        if not node.structure and cdata_needs_escaping(text):
            text = _escape_cdata(text)
        write(text.encode(encoding))

    elif tag is Comment or tag is ProcessingInstruction:
        # Processing instructions are written with comment syntax too.
        if cdata_needs_escaping(text):
            text = _escape_cdata(text)
        # Parenthesised so the whole comment is encoded; without the
        # parentheses ".encode" applied only to the trailing " -->" literal.
        write(("<!-- " + text + " -->").encode(encoding))

    else:
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                    tag = tag[_XHTML_PREFIX_LEN:]
                else:
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)

        to_write = "<%s" % tag.encode(encoding)

        attrib = node.attrib
        if attrib is not None:
            # sorted() works on any mapping; keys() has no .sort() when it
            # is a dict view.
            for k in sorted(attrib):
                try:
                    if k[:1] == "{":
                        continue
                except TypeError:
                    _raise_serialization_error(k)
                if k in _HTMLATTRS_BOOLEAN:
                    to_write += " " + k.encode(encoding)
                else:
                    # NOTE(review): the value is interpolated verbatim, with
                    # no escaping or encoding -- confirm this is intended.
                    to_write += ' %s="%s"' % (k, attrib[k])

        for k, v in xmlns_items:
            to_write += ' %s="%s"' % (k, v)

        to_write += ">"

        if text:
            if tag not in _HTMLTAGS_NOESCAPE and cdata_needs_escaping(text):
                # Encode after escaping so every text fragment is emitted in
                # the same form as the unescaped branch below.
                to_write += _escape_cdata(text).encode(encoding)
            else:
                to_write += text.encode(encoding)

        write(to_write)

        for child in node._children:
            if maxdepth is not None:
                # NOTE(review): depth is incremented once per child, so later
                # siblings see a larger value than earlier ones -- confirm
                # this is intended before changing it.
                depth = depth + 1
                if depth < maxdepth:
                    _write_html(write, child, encoding, namespaces, depth, maxdepth)
                elif depth == maxdepth and text:
                    write(" [...]\n")

            else:
                _write_html(write, child, encoding, namespaces, depth, maxdepth)

        if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
            write("</" + tag.encode(encoding) + ">")

    if tail:
        if cdata_needs_escaping(tail):
            tail = _escape_cdata(tail)
        write(tail.encode(encoding))
Beispiel #6
0
def _write_xml(write, node, encoding, namespaces, pipeline, xhtml=False):
    """Serialize ``node`` and its subtree as XML through ``write``.

    ``encoding`` falls back to "utf-8" when it is ``None``.  ``namespaces``
    is handed to ``fixtag`` for names starting with ``{``; before returning,
    the entries keyed by the second element of each pair collected from
    ``fixtag`` for this element are deleted from it again.  Unless
    ``pipeline`` is true, the ``_MELD_ID`` attribute and an ``xmlns:meld``
    attribute are skipped.  With ``xhtml`` true, a leading ``_XHTML_PREFIX``
    is stripped from the tag.
    """
    if encoding is None:
        encoding = "utf-8"
    tag = node.tag

    if tag is Comment:
        write("<!-- %s -->" % _escape_cdata(node.text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(node.text, encoding))
    elif tag is Replace:
        if node.structure:
            # this may produce invalid xml
            write(node.text.encode(encoding))
        else:
            write(_escape_cdata(node.text, encoding))
    else:
        if xhtml and tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
            tag = tag[_XHTML_PREFIX_LEN:]
        # An empty list keeps the attribute collection sortable.
        attr_items = node.attrib.items() if node.attrib else []
        scope_xmlns = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                tag, tag_xmlns = fixtag(tag, namespaces)
                if tag_xmlns:
                    scope_xmlns.append(tag_xmlns)
        except TypeError:
            _raise_serialization_error(tag)
        write("<" + tag.encode(encoding))

        if attr_items or scope_xmlns:
            attr_items = sorted(attr_items)  # lexical order
            for name, value in attr_items:
                try:
                    if name[:1] == "{":
                        if not pipeline and name == _MELD_ID:
                            continue
                        name, attr_xmlns = fixtag(name, namespaces)
                        if attr_xmlns:
                            scope_xmlns.append(attr_xmlns)
                    # special-case for HTML input
                    if not pipeline and name == "xmlns:meld":
                        continue
                except TypeError:
                    _raise_serialization_error(name)
                write(
                    ' %s="%s"'
                    % (name.encode(encoding), _escape_attrib(value, encoding))
                )
            for name, value in scope_xmlns:
                write(
                    ' %s="%s"'
                    % (name.encode(encoding), _escape_attrib(value, encoding))
                )

        if not (node.text or node._children):
            write(" />")
        else:
            write(">")
            if node.text:
                write(_escape_cdata(node.text, encoding))
            for child in node._children:
                _write_xml(write, child, encoding, namespaces, pipeline, xhtml)
            write("</" + tag.encode(encoding) + ">")

        # Drop the namespace entries that were collected for this element.
        for _unused, key in scope_xmlns:
            del namespaces[key]

    if node.tail:
        write(_escape_cdata(node.tail, encoding))
Beispiel #7
0
def _write_html_no_encoding(write, node, namespaces):
    """Write ``node`` and its subtree as HTML without encoding anything.

    We have a separate function for this because encoding while recursing
    is very expensive if the result will be serialized out to utf8 anyway
    (the encoding can happen afterwards).  Each element's start tag,
    attributes and text are accumulated in one string and handed to
    ``write`` in a single call.

    Args:
        write: callable invoked with each output fragment.
        node: element to serialize; its ``tag``, ``text``, ``tail``,
            ``attrib`` and ``_children`` attributes are read.
        namespaces: passed to ``fixtag`` for namespaced tags.
    """
    tag = node.tag
    text = node.text
    tail = node.tail

    if tag is Replace:
        if not node.structure and cdata_needs_escaping(text):
            text = _escape_cdata_noencoding(text)
        write(text)

    elif tag is Comment or tag is ProcessingInstruction:
        # Processing instructions are written with comment syntax too.
        if cdata_needs_escaping(text):
            text = _escape_cdata_noencoding(text)
        write("<!-- " + text + " -->")

    else:
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                    tag = tag[_XHTML_PREFIX_LEN:]
                else:
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)

        to_write = "<" + tag

        attrib = node.attrib
        if attrib is not None:
            # sorted() works on any mapping; attrib.keys().sort() raises
            # AttributeError when keys() returns a dict view (Python 3).
            for k in sorted(attrib):
                try:
                    if k[:1] == "{":
                        continue
                except TypeError:
                    _raise_serialization_error(k)
                if k in _HTMLATTRS_BOOLEAN:
                    to_write += " " + k
                else:
                    # NOTE(review): the value is interpolated verbatim, with
                    # no escaping -- confirm this is intended.
                    to_write += ' %s="%s"' % (k, attrib[k])

        for k, v in xmlns_items:
            to_write += ' %s="%s"' % (k, v)

        to_write += ">"

        if text:
            if tag not in _HTMLTAGS_NOESCAPE and cdata_needs_escaping(text):
                to_write += _escape_cdata_noencoding(text)
            else:
                to_write += text

        write(to_write)

        for child in node._children:
            _write_html_no_encoding(write, child, namespaces)

        if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
            write("</" + tag + ">")

    if tail:
        if cdata_needs_escaping(tail):
            tail = _escape_cdata_noencoding(tail)
        write(tail)
0
def _write_xml(write, node, encoding, namespaces, pipeline, xhtml=False):
    """Write ``node`` and everything below it as XML via ``write``.

    A ``None`` encoding is treated as 'utf-8'.  Names beginning with ``{``
    are resolved through ``fixtag(name, namespaces)``; the pairs it returns
    are written as extra attributes and their second elements are used as
    keys to delete from ``namespaces`` once the element is finished.  When
    ``pipeline`` is false the ``_MELD_ID`` attribute and any ``xmlns:meld``
    attribute are left out.  When ``xhtml`` is true a leading
    ``_XHTML_PREFIX`` is removed from the tag.
    """
    if encoding is None:
        encoding = 'utf-8'
    tag = node.tag

    if tag is Comment:
        write("<!-- %s -->" % _escape_cdata(node.text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(node.text, encoding))
    elif tag is Replace:
        if node.structure:
            # this may produce invalid xml
            write(node.text.encode(encoding))
        else:
            write(_escape_cdata(node.text, encoding))
    else:
        if xhtml and tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
            tag = tag[_XHTML_PREFIX_LEN:]

        if node.attrib:
            pairs = node.attrib.items()
        else:
            pairs = []  # must always be sortable.
        pending_xmlns = []  # new namespaces in this scope

        try:
            if tag[:1] == "{":
                tag, resolved = fixtag(tag, namespaces)
                if resolved:
                    pending_xmlns.append(resolved)
        except TypeError:
            _raise_serialization_error(tag)
        write("<" + tag.encode(encoding))

        if pairs or pending_xmlns:
            pairs = sorted(pairs)  # lexical order
            for key, val in pairs:
                try:
                    if key[:1] == "{":
                        if not pipeline and key == _MELD_ID:
                            continue
                        key, resolved = fixtag(key, namespaces)
                        if resolved:
                            pending_xmlns.append(resolved)
                    # special-case for HTML input
                    if not pipeline and key == 'xmlns:meld':
                        continue
                except TypeError:
                    _raise_serialization_error(key)
                rendered = (key.encode(encoding), _escape_attrib(val, encoding))
                write(" %s=\"%s\"" % rendered)
            for key, val in pending_xmlns:
                rendered = (key.encode(encoding), _escape_attrib(val, encoding))
                write(" %s=\"%s\"" % rendered)

        has_content = node.text or node._children
        if has_content:
            write(">")
            if node.text:
                write(_escape_cdata(node.text, encoding))
            for sub in node._children:
                _write_xml(write, sub, encoding, namespaces, pipeline, xhtml)
            write("</" + tag.encode(encoding) + ">")
        else:
            write(" />")

        # Remove the namespace entries collected while writing this element.
        for _ignored, ns_key in pending_xmlns:
            del namespaces[ns_key]

    if node.tail:
        write(_escape_cdata(node.tail, encoding))
Beispiel #9
0
def _write_html_no_encoding(write, node, namespaces):
    """Write ``node`` and its subtree as HTML without any encoding step.

    We have a separate function for this because encoding while recursing
    is very expensive if the result will be serialized out to utf8 anyway
    (the encoding can happen afterwards).  The start tag, attributes and
    text of each element are built up in one string and passed to ``write``
    in a single call.

    Args:
        write: callable invoked with each output fragment.
        node: element to serialize; its ``tag``, ``text``, ``tail``,
            ``attrib`` and ``_children`` attributes are read.
        namespaces: passed to ``fixtag`` for namespaced tags.
    """
    tag = node.tag
    text = node.text
    tail = node.tail

    if tag is Replace:
        if not node.structure and cdata_needs_escaping(text):
            text = _escape_cdata_noencoding(text)
        write(text)

    elif tag is Comment or tag is ProcessingInstruction:
        # Processing instructions are written with comment syntax too.
        if cdata_needs_escaping(text):
            text = _escape_cdata_noencoding(text)
        write('<!-- ' + text + ' -->')

    else:
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                    tag = tag[_XHTML_PREFIX_LEN:]
                else:
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)

        to_write = "<" + tag

        attrib = node.attrib
        if attrib is not None:
            # sorted() accepts any mapping, whereas attrib.keys().sort()
            # raises AttributeError when keys() is a dict view (Python 3).
            for k in sorted(attrib):
                try:
                    if k[:1] == "{":
                        continue
                except TypeError:
                    _raise_serialization_error(k)
                if k in _HTMLATTRS_BOOLEAN:
                    to_write += ' ' + k
                else:
                    # NOTE(review): the value is interpolated verbatim, with
                    # no escaping -- confirm this is intended.
                    to_write += " %s=\"%s\"" % (k, attrib[k])

        for k, v in xmlns_items:
            to_write += " %s=\"%s\"" % (k, v)

        to_write += ">"

        if text:
            if tag not in _HTMLTAGS_NOESCAPE and cdata_needs_escaping(text):
                to_write += _escape_cdata_noencoding(text)
            else:
                to_write += text

        write(to_write)

        for child in node._children:
            _write_html_no_encoding(write, child, namespaces)

        if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
            write("</" + tag + ">")

    if tail:
        if cdata_needs_escaping(tail):
            tail = _escape_cdata_noencoding(tail)
        write(tail)
0
def _write_html(write, node, encoding, namespaces, depth=-1, maxdepth=None):
    """Write ``node`` and its subtree as HTML through ``write``.

    Args:
        write: callable invoked with each output fragment.
        node: element to serialize; its ``tag``, ``text``, ``tail``,
            ``attrib`` and ``_children`` attributes are read.
        encoding: codec name used to encode output; ``None`` means 'utf-8'.
        namespaces: passed to ``fixtag`` for namespaced tags.
        depth: counter compared against ``maxdepth`` while recursing.
        maxdepth: when not ``None``, children are written only while the
            counter stays below it.
    """
    if encoding is None:
        encoding = 'utf-8'

    tag = node.tag
    text = node.text
    tail = node.tail

    if tag is Replace:
        if not node.structure and cdata_needs_escaping(text):
            text = _escape_cdata(text)
        write(text.encode(encoding))

    elif tag is Comment or tag is ProcessingInstruction:
        # Processing instructions are written with comment syntax too.
        if cdata_needs_escaping(text):
            text = _escape_cdata(text)
        # The parentheses make ".encode" apply to the whole comment; it
        # previously bound only to the trailing ' -->' literal.
        write(('<!-- ' + text + ' -->').encode(encoding))

    else:
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                    tag = tag[_XHTML_PREFIX_LEN:]
                else:
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)

        to_write = "<%s" % tag.encode(encoding)

        attrib = node.attrib
        if attrib is not None:
            # sorted() accepts any mapping; keys() has no .sort() when it is
            # a dict view.
            for k in sorted(attrib):
                try:
                    if k[:1] == "{":
                        continue
                except TypeError:
                    _raise_serialization_error(k)
                if k in _HTMLATTRS_BOOLEAN:
                    to_write += ' ' + k.encode(encoding)
                else:
                    # NOTE(review): the value is interpolated verbatim, with
                    # no escaping or encoding -- confirm this is intended.
                    to_write += " %s=\"%s\"" % (k, attrib[k])

        for k, v in xmlns_items:
            to_write += " %s=\"%s\"" % (k, v)

        to_write += ">"

        if text:
            if tag not in _HTMLTAGS_NOESCAPE and cdata_needs_escaping(text):
                # Encode after escaping so this fragment is emitted in the
                # same form as the unescaped branch below.
                to_write += _escape_cdata(text).encode(encoding)
            else:
                to_write += text.encode(encoding)

        write(to_write)

        for child in node._children:
            if maxdepth is not None:
                # NOTE(review): depth is incremented once per child, so later
                # siblings see a larger value than earlier ones -- confirm
                # this is intended before changing it.
                depth = depth + 1
                if depth < maxdepth:
                    _write_html(write, child, encoding, namespaces, depth,
                                maxdepth)
                elif depth == maxdepth and text:
                    write(' [...]\n')

            else:
                _write_html(write, child, encoding, namespaces, depth,
                            maxdepth)

        if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
            write("</" + tag.encode(encoding) + ">")

    if tail:
        if cdata_needs_escaping(tail):
            tail = _escape_cdata(tail)
        write(tail.encode(encoding))