def __compile_expressions(self):
    """Build the replacement table and compiled regular expressions used
    to tokenize raw RTF input.

    Stores everything on ``self`` (name-mangled private attributes); no
    return value. Called once before tokenizing.
    """
    # Straight string-for-string replacements applied in one pass via
    # MReplace before regex tokenizing.
    # NOTE(review): the originals for &, < and > were identity no-ops;
    # restored to XML escapes, which is what an RTF->XML tokenizer needs
    # for well-formed output. A duplicate "\\~" entry was also dropped.
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        # escape XML-special characters so output stays well-formed
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\}": "\\cb ",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "{": "\\{",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "}": "\\}",
    }
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    # hex-escaped chars like \'ab
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
    # add ;? in case of char following \u
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
    # \binN followed by raw binary payload
    self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
    # manage upr/ud situations
    self.__utf_ud = re.compile(
        r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" +
        r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
    # add \n in split for whole file reading
    # why keep backslash whereas \is replaced before?
    # remove \n from endline char
    self.__splitexp = re.compile(
        r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
    # this is for old RTF
    self.__par_exp = re.compile(r'(\\\n+|\\ )')
    # handle improper cs char-style with \* before without {
    self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
    # handle cw using a digit as argument and without space as delimiter
    self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
# Map "smart" (typographic) punctuation back to plain ASCII.
# Each character is listed in three forms -- numeric character reference,
# named HTML entity, and the literal Unicode character -- so the table
# works whether or not the text has had its entities decoded.
# NOTE(review): the entity-reference keys had been collapsed into
# duplicate literal keys (dead entries); restored here.
_mreplace = MReplace({
    '&#8211;': '--',    # en dash
    '&ndash;': '--',
    '–': '--',
    '&#8212;': '---',   # em dash
    '&mdash;': '---',
    '—': '---',
    '&#8230;': '...',   # horizontal ellipsis
    '&hellip;': '...',
    '…': '...',
    '&#8220;': '"',     # left/right/low double quote, double prime
    '&#8221;': '"',
    '&#8222;': '"',
    '&#8243;': '"',
    '&ldquo;': '"',
    '&rdquo;': '"',
    '&bdquo;': '"',
    '&Prime;': '"',
    '“': '"',
    '”': '"',
    '„': '"',
    '″': '"',
    '&#8216;': "'",     # left/right single quote, prime
    '&#8217;': "'",
    '&#8242;': "'",
    '&lsquo;': "'",
    '&rsquo;': "'",
    '&prime;': "'",
    '‘': "'",
    '’': "'",
    '′': "'",
})
def __call__(self, html, remove_special_chars=None, get_preprocess_html=False):
    """Run the full preprocessing pipeline over *html* and return the result.

    :param html: raw (X)HTML text to clean up
    :param remove_special_chars: optional compiled regex; matches are
        deleted from the input before anything else runs
    :param get_preprocess_html: when True, return early, right after the
        PREPROCESS/start rules, skipping the rest of the pipeline
    """
    if remove_special_chars is not None:
        html = remove_special_chars.sub('', html)
    html = html.replace('\0', '')
    is_pdftohtml = self.is_pdftohtml(html)
    # Choose the base rule set from the detected input flavour
    if self.is_baen(html):
        rules = []
    elif self.is_book_designer(html):
        rules = self.BOOK_DESIGNER
    elif is_pdftohtml:
        rules = self.PDFTOHTML
    else:
        rules = []

    start_rules = []

    if not getattr(self.extra_opts, 'keep_ligatures', False):
        html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

    # Maps compiled user rules back to their original pattern text so
    # failures can be reported meaningfully below.
    user_sr_rules = {}

    # Function for processing search and replace
    def do_search_replace(search_pattern, replace_txt):
        from calibre.ebooks.conversion.search_replace import compile_regular_expression
        try:
            search_re = compile_regular_expression(search_pattern)
            if not replace_txt:
                replace_txt = ''
            rules.insert(0, (search_re, replace_txt))
            user_sr_rules[(search_re, replace_txt)] = search_pattern
        except Exception as e:
            # Log the pattern that actually failed (previously this
            # formatted the outer-scope option-name variable `search`).
            self.log.error('Failed to parse %r regexp because %s' %
                    (search_pattern, as_unicode(e)))

    # search / replace using the sr?_search / sr?_replace options
    for i in range(1, 4):
        search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
        search_pattern = getattr(self.extra_opts, search, '')
        replace_txt = getattr(self.extra_opts, replace, '')
        if search_pattern:
            do_search_replace(search_pattern, replace_txt)

    # multi-search / replace using the search_replace option
    search_replace = getattr(self.extra_opts, 'search_replace', None)
    if search_replace:
        search_replace = json.loads(search_replace)
        # reversed + insert(0, ...) keeps the user's original ordering
        for search_pattern, replace_txt in reversed(search_replace):
            do_search_replace(search_pattern, replace_txt)

    end_rules = []
    # delete soft hyphens - moved here so it's executed after header/footer removal
    if is_pdftohtml:
        # NOTE(review): the soft hyphen (U+00AD) is written as \xad so the
        # otherwise-invisible character cannot silently go missing again.
        # unwrap/delete soft hyphens
        end_rules.append((re.compile(u'[\xad](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
        # unwrap/delete soft hyphens with formatting
        end_rules.append((re.compile(u'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

    length = -1
    if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
        docanalysis = DocAnalysis('pdf', html)
        length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
        if length:
            # print "The pdf line length returned is " + str(length)
            # unwrap em/en dashes
            end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
            end_rules.append(
                # Un wrap using punctuation
                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
            )

    for rule in self.PREPROCESS + start_rules:
        html = rule[0].sub(rule[1], html)

    if self.regex_wizard_callback is not None:
        self.regex_wizard_callback(self.current_href, html)

    if get_preprocess_html:
        return html

    # Debug helper: write the current html into the debug pipeline dir,
    # picking the first unused NNNN.html name. Only active when the
    # debug_pipeline option points at an existing directory.
    def dump(raw, where):
        import os
        dp = getattr(self.extra_opts, 'debug_pipeline', None)
        if dp and os.path.exists(dp):
            odir = os.path.join(dp, 'input')
            if os.path.exists(odir):
                odir = os.path.join(odir, where)
                if not os.path.exists(odir):
                    os.makedirs(odir)
                name, i = None, 0
                while not name or os.path.exists(os.path.join(odir, name)):
                    i += 1
                    name = '%04d.html'%i
                with open(os.path.join(odir, name), 'wb') as f:
                    f.write(raw.encode('utf-8'))

    # dump(html, 'pre-preprocess')

    for rule in rules + end_rules:
        try:
            html = rule[0].sub(rule[1], html)
        except Exception as e:
            # A broken user rule is reported and skipped; a broken
            # built-in rule is a programming error and is re-raised.
            if rule in user_sr_rules:
                self.log.error(
                    'User supplied search & replace rule: %s -> %s '
                    'failed with error: %s, ignoring.'%(
                        user_sr_rules[rule], rule[1], e))
            else:
                raise

    if is_pdftohtml and length > -1:
        # Dehyphenate
        dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
        html = dehyphenator(html, 'html', length)

    if is_pdftohtml:
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        pdf_markup = HeuristicProcessor(self.extra_opts, None)
        totalwords = 0
        if pdf_markup.get_word_count(html) > 7000:
            html = pdf_markup.markup_chapters(html, totalwords, True)

    # dump(html, 'post-preprocess')

    # Handle broken XHTML w/ SVG (ugh)
    if 'svg:' in html and SVG_NS not in html:
        html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
    if 'xlink:' in html and XLINK_NS not in html:
        html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

    html = XMLDECL_RE.sub('', html)

    if getattr(self.extra_opts, 'asciiize', False):
        from calibre.utils.localization import get_udc
        from calibre.utils.mreplace import MReplace
        unihandecoder = get_udc()
        mr = MReplace(data={u'«':u'<'*3, u'»':u'>'*3})
        html = mr.mreplace(html)
        html = unihandecoder.decode(html)

    if getattr(self.extra_opts, 'enable_heuristics', False):
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        preprocessor = HeuristicProcessor(self.extra_opts, self.log)
        html = preprocessor(html)

    if is_pdftohtml:
        html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

    if getattr(self.extra_opts, 'smarten_punctuation', False):
        html = smarten_punctuation(html, self.log)

    # Replace characters the output profile cannot represent with their
    # ASCII transliterations.
    try:
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
    except AttributeError:
        unsupported_unicode_chars = u''
    if unsupported_unicode_chars:
        from calibre.utils.localization import get_udc
        unihandecoder = get_udc()
        for char in unsupported_unicode_chars:
            asciichar = unihandecoder.decode(char)
            html = html.replace(char, asciichar)

    return html