def clean_text(self, text):
    r"""Return ``text`` with redundant PML markup and whitespace removed.

    The passes run in a fixed order, each one working on the output of the
    previous one:

    1. drop pairs of ``\p`` tags separated only by whitespace;
    2. drop ``\Q="name"`` anchors that no ``\q="#name"`` link points to;
    3. drop a ``\Cn="..."`` tag found between a pair of identical ``\x`` /
       ``\Xn`` tags;
    4. delete U+00C2, turn U+00A0 into a plain space, and hand every
       remaining non-ASCII character to ``unipmlcode``;
    5. normalise spaces, repeated ``\c`` tags and blank lines.

    The last step depends on ``self.opts.remove_paragraph_spacing``: when
    it is true, runs of newlines collapse to one and lines without a
    formatting tag get a leading space; otherwise runs of three or more
    newlines collapse to two.

    :param text: the PML markup to clean.
    :return: the cleaned markup.
    """

    def _to_pml_code(match):
        # NOTE(review): unipmlcode is defined outside this block; presumably
        # it returns the PML escape for the character -- confirm.
        return unipmlcode(match.group())

    def _indent_plain_line(match):
        line = match.group('text')
        # Lines that already carry one of these tags are left untouched.
        if re.search(r'\\[XxCmrctTp]', line):
            return line
        return ' %s' % line

    # Collapse doubled \p tags.
    text = re.sub(r'\\p\s*\\p', '', text)

    # Anchors nobody links to are dead weight; strip them.
    defined_anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
    linked_anchors = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
    for name in defined_anchors - linked_anchors:
        text = text.replace('\\Q="%s"' % name, '')

    # A \Cn tag sitting between matching \x / \Xn tags is removed; the
    # surrounding text (groups a and b) and both tags (group t) are kept.
    chapter_tag_in_heading = unicode_type(
        r'(?msu)(?P<t>\\(x|X[0-4]))(?P<a>.*?)(?P<c>\\C[0-4]\s*=\s*"[^"]*")(?P<b>.*?)(?P=t)'
    )
    text = re.sub(chapter_tag_in_heading, '\\g<t>\\g<a>\\g<b>\\g<t>', text)

    # Replace bad characters.
    text = text.replace(u'\xc2', '')
    text = text.replace(u'\xa0', ' ')

    # Anything outside the ASCII range cannot be written literally, so it
    # is replaced by whatever unipmlcode returns for it.
    text = re.sub('[^\x00-\x7f]', _to_pml_code, text)

    # Whitespace normalisation; the order of the passes matters.
    whitespace_passes = (
        ('(?m)^[ ]+', ''),                        # leading spaces on a line
        ('(?m)[ ]+$', ''),                        # trailing spaces on a line
        ('[ ]{2,}', ' '),                         # runs of spaces
        (r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n'),  # repeated \c pairs
        ('\n[ ]+\n', '\n\n'),                     # lines holding only spaces
    )
    for pattern, replacement in whitespace_passes:
        text = re.sub(pattern, replacement, text)

    if not self.opts.remove_paragraph_spacing:
        return re.sub('\n{3,}', '\n\n', text)

    text = re.sub('\n{2,}', '\n', text)
    # Only indent lines that don't have special formatting.
    return re.sub('(?imu)^(?P<text>.+)$', _indent_plain_line, text)
def clean_text(self, text):
    r"""Clean up PML markup and return the result.

    The following passes are applied in order:

    * pairs of ``\p`` tags separated only by whitespace are removed;
    * ``\Q="name"`` anchors with no matching ``\q="#name"`` link are
      removed;
    * a ``\Cn="..."`` tag found between a pair of identical ``\x`` /
      ``\Xn`` tags is removed;
    * U+00C2 is deleted, U+00A0 becomes a plain space, and every other
      non-ASCII character is passed to ``unipmlcode``;
    * leading/trailing spaces, runs of spaces, repeated ``\c`` tags and
      surplus blank lines are condensed.

    When ``self.opts.remove_paragraph_spacing`` is true, runs of newlines
    collapse to a single newline and every line that carries none of the
    tags ``\X \x \C \m \r \c \t \T \p`` is indented by one space.
    Otherwise runs of three or more newlines collapse to two.

    All regular expressions are written as raw strings.  The earlier
    spelling used a ``ur''`` literal (a syntax error on Python 3) and
    non-raw ``'\\c'`` patterns, which hand the regex engine ``\c`` instead
    of an escaped backslash followed by ``c``.

    :param text: the PML markup to clean.
    :return: the cleaned markup.
    """
    # Remove excessive \p tags
    text = re.sub(r'\\p\s*\\p', '', text)

    # Remove anchors that do not have links
    anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
    links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
    for unused in anchors.difference(links):
        text = text.replace('\\Q="%s"' % unused, '')

    # Remove \Cn tags that are within \x and \Xn tags. Group t is the
    # opening tag (repeated as the closing tag), a and b are the text
    # either side of the \Cn tag, and c -- the tag itself -- is dropped.
    text = re.sub(
        r'(?msu)(?P<t>\\(x|X[0-4]))(?P<a>.*?)(?P<c>\\C[0-4]\s*=\s*"[^"]*")(?P<b>.*?)(?P=t)',
        r'\g<t>\g<a>\g<b>\g<t>', text)

    # Replace bad characters.
    text = text.replace(u'\xc2', '')
    text = text.replace(u'\xa0', ' ')

    # Turn all characters that cannot be represented by themselves into
    # their PML code equivalent.
    # NOTE(review): unipmlcode is defined outside this block; presumably it
    # returns the PML escape for a single character -- confirm.
    text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)

    # Remove excess spaces at beginning and end of lines
    text = re.sub('(?m)^[ ]+', '', text)
    text = re.sub('(?m)[ ]+$', '', text)

    # Remove excessive spaces
    text = re.sub('[ ]{2,}', ' ', text)

    # Condense excessive \c empty line sequences. Both arguments must be raw
    # strings so that the regex engine sees an escaped backslash.
    text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)

    # Remove excessive newlines.
    text = re.sub('\n[ ]+\n', '\n\n', text)
    if self.opts.remove_paragraph_spacing:
        text = re.sub('\n{2,}', '\n', text)
        # Only indent lines that don't have special formatting
        text = re.sub(
            '(?imu)^(?P<text>.+)$',
            lambda mo: mo.group('text')
            if re.search(r'\\[XxCmrctTp]', mo.group('text'))
            else ' %s' % mo.group('text'),
            text)
    else:
        text = re.sub('\n{3,}', '\n\n', text)

    return text