Exemple #1
0
    def clean_text(self, text):
        # Remove excessive \p tags
        text = re.sub(r'\\p\s*\\p', '', text)

        # Remove anchors that do not have links
        anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
        links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
        for unused in anchors.difference(links):
            text = text.replace('\\Q="%s"' % unused, '')

        # Remove \Cn tags that are within \x and \Xn tags
        text = re.sub(
            unicode_type(
                r'(?msu)(?P<t>\\(x|X[0-4]))(?P<a>.*?)(?P<c>\\C[0-4]\s*=\s*"[^"]*")(?P<b>.*?)(?P=t)'
            ), '\\g<t>\\g<a>\\g<b>\\g<t>', text)

        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Turn all characters that cannot be represented by themself into their
        # PML code equivelent
        text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)

        # Remove excess spaces at beginning and end of lines
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)

        # Remove excessive spaces
        text = re.sub('[ ]{2,}', ' ', text)

        # Condense excessive \c empty line sequences.
        text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            # Only indent lines that don't have special formatting
            text = re.sub(
                '(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
                if re.search(r'\\[XxCmrctTp]', mo.group('text')) else
                '        %s' % mo.group('text'), text)
        else:
            text = re.sub('\n{3,}', '\n\n', text)

        return text
Exemple #2
0
    def clean_text(self, text):
        # Remove excessive \p tags
        text = re.sub(r'\\p\s*\\p', '', text)

        # Remove anchors that do not have links
        anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
        links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
        for unused in anchors.difference(links):
            text = text.replace('\\Q="%s"' % unused, '')

        # Remove \Cn tags that are within \x and \Xn tags
        text = re.sub(ur'(?msu)(?P<t>\\(x|X[0-4]))(?P<a>.*?)(?P<c>\\C[0-4]\s*=\s*"[^"]*")(?P<b>.*?)(?P=t)', '\g<t>\g<a>\g<b>\g<t>', text)

        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Turn all characters that cannot be represented by themself into their
        # PML code equivelent
        text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)

        # Remove excess spaces at beginning and end of lines
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)

        # Remove excessive spaces
        text = re.sub('[ ]{2,}', ' ', text)

        # Condense excessive \c empty line sequences.
        text = re.sub('(\\c\s*\\c\s*){2,}', '\\c \n\\c\n', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            # Only indent lines that don't have special formatting
            text = re.sub('(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
                          if re.search(r'\\[XxCmrctTp]', mo.group('text')) else '        %s' % mo.group('text'), text)
        else:
            text = re.sub('\n{3,}', '\n\n', text)

        return text