def register(self, unicode_text, latex_text, mode='text', package=None,
             decode=True, encode=True):
    """Register a correspondence between *unicode_text* and *latex_text*.

    :param str unicode_text: A unicode character.
    :param bytes latex_text: Its corresponding LaTeX translation.
    :param str mode: LaTeX mode in which the translation applies
        (``'text'`` or ``'math'``).
    :param str package: LaTeX package requirements (currently ignored).
    :param bool decode: Whether this translation applies to decoding
        (default: ``True``).
    :param bool encode: Whether this translation applies to encoding
        (default: ``True``).
    """
    if package is not None:
        # TODO implement packages
        pass
    if mode == 'math':
        # also register text version
        self.register(unicode_text, b'$' + latex_text + b'$', mode='text',
                      package=package, decode=decode, encode=encode)
        # XXX for the time being, we do not perform in-math substitutions
        return
    # tokenize, and register unicode translation
    self.lexer.reset()
    self.lexer.state = 'M'
    tokens = tuple(self.lexer.get_tokens(latex_text, final=True))
    if decode:
        if tokens not in self.unicode_map:
            self.max_length = max(self.max_length, len(tokens))
            self.unicode_map[tokens] = unicode_text
        # also register token variant with brackets, if appropriate
        # for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc.
        # note: we do not remove brackets (they sometimes matter,
        # e.g. bibtex uses them to prevent lower case transformation)
        if (len(tokens) == 2
                and tokens[0].name.startswith('control')
                and tokens[1].name == 'chars'):
            alt_tokens = (
                tokens[0],
                lexer.Token('chars', b'{'),
                tokens[1],
                lexer.Token('chars', b'}'),
            )
            if alt_tokens not in self.unicode_map:
                self.max_length = max(self.max_length, len(alt_tokens))
                self.unicode_map[alt_tokens] = u"{" + unicode_text + u"}"
    if encode and unicode_text not in self.latex_map:
        assert len(unicode_text) == 1
        self.latex_map[unicode_text] = (latex_text, tokens)
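
# Usage sketch (illustrative only, not part of the module): given a table
# instance such as the module-level _LATEX_UNICODE_TABLE used further below,
# a text-mode and a math-mode correspondence could be registered like this;
# the chosen characters and LaTeX fragments are examples, not entries the
# module is known to ship with.
#
#     _LATEX_UNICODE_TABLE.register(
#         u'\N{LATIN SMALL LETTER E WITH ACUTE}', b"\\'e")
#     _LATEX_UNICODE_TABLE.register(
#         u'\N{GREEK SMALL LETTER ALPHA}', b'\\alpha', mode='math')
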
def _get_latex_bytes_tokens_from_char(self, c):
    # if ascii, try latex equivalents
    # (this covers \, #, &, and other special LaTeX characters)
    if ord(c) < 128:
        try:
            return self.table.latex_map[c]
        except KeyError:
            pass
    # next, try input encoding
    try:
        bytes_ = c.encode(self.inputenc, 'strict')
    except UnicodeEncodeError:
        pass
    else:
        if self.binary_mode:
            return bytes_, (lexer.Token(name='chars', text=bytes_),)
        else:
            return c, (lexer.Token(name='chars', text=c),)
    # next, try latex equivalents of common unicode characters
    try:
        return self.table.latex_map[c]
    except KeyError:
        # translation failed
        if self.errors == 'strict':
            raise UnicodeEncodeError(
                "latex",  # codec
                c,  # problematic input
                0, 1,  # location of problematic character
                "don't know how to translate {0} into latex"
                .format(repr(c)))
        elif self.errors == 'ignore':
            return self.emptychar, (self.emptytoken,)
        elif self.errors == 'replace':
            # use the \\char command
            # this assumes
            # \usepackage[T1]{fontenc}
            # \usepackage[utf8]{inputenc}
            if self.binary_mode:
                bytes_ = b'{\\char' + str(ord(c)).encode("ascii") + b'}'
            else:
                bytes_ = u'{\\char' + str(ord(c)) + u'}'
            return bytes_, (lexer.Token(name='chars', text=bytes_),)
        elif self.errors == 'keep' and not self.binary_mode:
            return c, (lexer.Token(name='chars', text=c),)
        else:
            raise ValueError(
                "latex codec does not support {0} errors"
                .format(self.errors))
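
# Behaviour sketch for the helper above (hedged; the exact results depend on
# the translation table, self.inputenc and self.binary_mode): for a character
# that appears in neither the table nor the input encoding, e.g. u'\u2603'
# (SNOWMAN) with an ASCII input encoding, the error modes roughly give:
#
#     errors='strict'   raises UnicodeEncodeError
#     errors='ignore'   returns the empty char/token pair
#     errors='replace'  returns b'{\\char9731}' (u'{\\char9731}' in text
#                       mode), relying on the fontenc/inputenc setup noted
#                       in the comments above
#     errors='keep'     returns the character unchanged (text mode only)
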
class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder):
    """Translating incremental encoder for latex. Maintains a state to
    determine whether control spaces etc. need to be inserted.
    """

    emptytoken = lexer.Token(u"unknown", u"")
    """The empty token."""

    table = _LATEX_UNICODE_TABLE
    """Translation table."""

    def __init__(self, errors='strict'):
        super(LatexIncrementalEncoder, self).__init__(errors=errors)
        self.reset()

    def reset(self):
        super(LatexIncrementalEncoder, self).reset()
        self.state = 'M'

    def get_space_bytes(self, bytes_):
        """Insert space bytes in space eating mode."""
        if self.state == 'S':
            # in space eating mode
            # control space needed?
            if bytes_.startswith(u' '):
                # replace by control space
                return u'\\ ', bytes_[1:]
            else:
                # insert space (it is eaten, but needed for separation)
                return u' ', bytes_
        else:
            return u'', bytes_

    def _get_latex_chars_tokens_from_char(self, c):
        # if ascii, try latex equivalents
        # (this covers \, #, &, and other special LaTeX characters)
        if ord(c) < 128:
            try:
                return self.table.latex_map[c]
            except KeyError:
                pass
        # next, try input encoding
        try:
            bytes_ = c.encode(self.inputenc, 'strict')
        except UnicodeEncodeError:
            pass
        else:
            return c, (lexer.Token(name=u'chars', text=c),)
        # next, try latex equivalents of common unicode characters
        try:
            return self.table.latex_map[c]
        except KeyError:
            # translation failed
            if self.errors == 'strict':
                raise UnicodeEncodeError(
                    "latex",  # codec
                    c,  # problematic input
                    0, 1,  # location of problematic character
                    "don't know how to translate {0} into latex".format(
                        repr(c)))
            elif self.errors == 'ignore':
                return u'', (self.emptytoken,)
            elif self.errors == 'replace':
                # use the \\char command
                # this assumes
                # \usepackage[T1]{fontenc}
                # \usepackage[utf8]{inputenc}
                bytes_ = u'{\\char' + str(ord(c)) + u'}'
                return bytes_, (lexer.Token(name=u'chars', text=bytes_),)
            elif self.errors == 'keep' and not self.binary_mode:
                return c, (lexer.Token(name=u'chars', text=c),)
            else:
                raise ValueError(
                    "latex codec does not support {0} errors".format(
                        self.errors))

    def get_latex_chars(self, unicode_, final=False):
        if not isinstance(unicode_, string_types):
            raise TypeError(
                "expected unicode for encode input, but got {0} instead"
                .format(unicode_.__class__.__name__))
        # convert character by character
        for pos, c in enumerate(unicode_):
            bytes_, tokens = self._get_latex_chars_tokens_from_char(c)
            space, bytes_ = self.get_space_bytes(bytes_)
            # update state
            if tokens[-1].name == u'control_word':
                # we're eating spaces
                self.state = 'S'
            else:
                self.state = 'M'
            if space:
                yield space
            yield bytes_
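
# Usage sketch (hedged; relies on the "latex" codec registration that the
# package performs outside this excerpt, and the expected output is only
# indicative):
#
#     import latexcodec  # noqa: F401 -- registers the "latex" codec
#     u"\N{LATIN SMALL LETTER E WITH ACUTE}".encode("latex")
#     # expected to yield roughly b"\\'e"
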