Ejemplo n.º 1
0
    def prepare_pml(self, pml):
        """Normalize raw PML markup through a fixed series of text passes.

        The passes run in this order: chapter markers are annotated with
        their stripped title, comment spans are dropped, spaces are tidied,
        footnote/sidebar elements are rewritten as PML-style codes,
        ampersands are escaped, character codes are expanded, and the result
        is handed to ``prepare_string_for_xml``.

        :param pml: The PML text to process.
        :return: The processed text, as returned by
            ``prepare_string_for_xml``.
        """
        def annotate_chapter(m):
            # Rewrite <code>text<code> as <code>="stripped text"text<code>
            # so the TOC can be generated from it later.
            code = m.group('c')
            body = m.group('text')
            return '%s="%s"%s%s' % (code, self.strip_pml(body), body, code)

        def block_replacer(template):
            # Build a replacement callback that fills `template` with the
            # element id and body, or removes the element when its body is
            # empty.
            def replace(m):
                body = m.group('text')
                if not body:
                    return ''
                return template % (m.group('target'), body)
            return replace

        def decimal_entity(m):
            return '&#%s;' % m.group('num')

        def hex_to_char(m):
            # my_unichr is defined outside this block; presumably it maps a
            # code point to its character.
            return '%s' % my_unichr(int(m.group('num'), 16))

        # Chapter markers: \\x first, then \\X0 .. \\X4.
        for chapter_pattern in (
            r'(?msu)(?P<c>\\x)(?P<text>.*?)(?P=c)',
            r'(?msu)(?P<c>\\X[0-4])(?P<text>.*?)(?P=c)',
        ):
            pml = re.sub(chapter_pattern, annotate_chapter, pml)

        # Drop comments (everything between a pair of \\v codes).
        pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml)

        # Tidy spaces: collapse runs of spaces, then trim spaces at line
        # starts and line ends, then empty out space-only lines.
        for space_pattern, substitute in (
            (r'(?mus)[ ]{2,}', ' '),
            (r'(?mus)^[ ]*(?=.)', ''),
            (r'(?mus)(?<=.)[ ]*$', ''),
            (r'(?mus)^[ ]*$', ''),
        ):
            pml = re.sub(space_pattern, substitute, pml)

        # Footnote and sidebar elements become \\FN / \\SB codes.
        for block_pattern, block_template in (
            (r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', '\\FN="%s"%s\\FN'),
            (r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', '\\SB="%s"%s\\SB'),
        ):
            pml = re.sub(block_pattern, block_replacer(block_template), pml)

        # Escape every ampersand up front so that a literal &amp; in the
        # text is not later turned into a bare &; it will display as &amp;.
        pml = pml.replace('&', '&amp;')

        # \\aNNN (three decimal digits) becomes a numeric entity; \\Uxxxx
        # (four lowercase hex digits) becomes the character itself.
        pml = re.sub(r'\\a(?P<num>\d{3})', decimal_entity, pml)
        pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', hex_to_char, pml)

        return prepare_string_for_xml(pml)
Ejemplo n.º 2
0
def tostring(raw, **kwargs):
    """Serialize ``raw`` with lxml and return bytes free of hex entities.

    lxml *sometimes* writes non-ascii characters in attribute values as hex
    character references; the original author observed this when a part of
    a larger tree is serialized. To keep the output of full and partial
    trees identical, every ``&#x...;`` reference in the serialized text is
    replaced by the corresponding character.

    :param raw: The object passed to ``etree.tostring``.
    :param kwargs: Extra keyword arguments forwarded to ``etree.tostring``.
        ``encoding`` (default ``'UTF-8'``) and ``xml_declaration`` (default
        ``False``) are consumed here: serialization is always done to text
        without a declaration, the declaration is prepended manually when
        requested, and the final text is encoded with ``encoding``.
    :return: The serialized document encoded with the requested encoding.
    """
    want_declaration = kwargs.pop('xml_declaration', False)
    target_encoding = kwargs.pop('encoding', 'UTF-8')

    # Force lxml to produce text with no declaration of its own; the
    # declaration and the byte encoding are handled below.
    kwargs.update(encoding=unicode_type, xml_declaration=False)
    serialized = etree.tostring(raw, **kwargs)

    if want_declaration:
        declaration = '<?xml version="1.0" encoding="%s"?>\n' % target_encoding
        serialized = declaration + serialized

    def hex_entity_to_char(m):
        # my_unichr is defined outside this block; presumably it maps a
        # code point to its character.
        return my_unichr(int(m.group(1), 16))

    text = re.sub(r'&#x([0-9A-Fa-f]+);', hex_entity_to_char, serialized)
    return text.encode(target_encoding)