def tokens(self, text):
    xhtmlsaxhandler = XHTMLSaxHandler(styles=self.styles, tags=self.tags)
    xhtmlsaxerrhandler = XHTMLSaxErrorHandler(self, self.ignore)
    # this conversion looks like a Python quirk (why does parseString need bytes, not str?)
    if type(text) is str:
        parsetext = core.strtobytes23(text)
    else:
        parsetext = text
    sax.parseString(parsetext, xhtmlsaxhandler, xhtmlsaxerrhandler)
    # the SAX handler emits plain text with code chunks wrapped in @start_block@/@end_block@ markers
    text = ''.join(xhtmlsaxhandler.text)

    #_inlcode_re = r'@start_inline@(.*?)@end_inline@'
    _blkcode_re = r'@start_block@(.*?)@end_block@'

    #inlcode_re = re.compile(_inlcode_re, re.MULTILINE)
    blkcode_re = re.compile(_blkcode_re, re.DOTALL | re.MULTILINE)

    cmds = core.Cmd.syntax.findtokens('cmddef', text)
    #inlcodes = inlcode_re.finditer(text)
    blkcodes = blkcode_re.finditer(text)

    tokens = []
    for m in cmds:
        token = core.CmdToken(
                XHTMLSaxHandler._decode_spec_chars(m.group(1)),
                m.start(0), m.end(0))
        tokens.append(token)
    #for m in inlcodes:
    #    # XXX line breaks in HTML are encoded with paragraph styling,
    #    # so this linearizing does not help
    #    tokentext = XHTMLSaxHandler._decode_spec_chars(m.group(1))
    #    tokentext = core.InlCodeToken.linearize(tokentext)
    #    token = core.InlCodeToken(tokentext, m.start(0), m.end(0))
    #    tokens.append(token)
    for m in blkcodes:
        tokentext = m.group(1).lstrip('\n').rstrip('\n ')
        tokentext = core.deltextindent(tokentext)
        token = core.BlkCodeToken(
                XHTMLSaxHandler._decode_spec_chars(tokentext),
                m.start(0), m.end(0))
        tokens.append(token)
    tokens.sort(key=lambda tok: tok.start)
    tokens.append(core.EndToken(None))
    return tokens
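# Illustrative sketch (not part of the parser): the SAX handler above rewrites the
# XHTML input into plain text where every code chunk is wrapped in @start_block@ /
# @end_block@ markers; tokens() then recovers each chunk with a DOTALL regex and
# trims the surrounding blank lines. The marked string below is hand-written.
def _demo_marker_extraction():
    r"""
    >>> import re
    >>> marked = 'intro @start_block@\ndef f(x):\n    return x\n@end_block@ outro'
    >>> blkcode_re = re.compile(r'@start_block@(.*?)@end_block@', re.DOTALL | re.MULTILINE)
    >>> [m.group(1).lstrip('\n').rstrip('\n ') for m in blkcode_re.finditer(marked)]
    ['def f(x):\n    return x']
    """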
def tokens(self, text):
    oosaxhandler = OOSaxHandler(style=self.style)
    # this conversion looks like a Python quirk (why does parseString need bytes, not str?)
    if type(text) is str:
        parsetext = core.strtobytes23(text)
    else:
        parsetext = text
    sax.parseString(parsetext, oosaxhandler)
    # the SAX handler emits plain text with chunks wrapped in @start_inline@/@start_block@ markers
    text = ''.join(oosaxhandler.text)

    _inlcode_re = r'@start_inline@(.*?)@end_inline@'
    _blkcode_re = r'@start_block@(.*?)@end_block@'

    inlcode_re = re.compile(_inlcode_re, re.MULTILINE)
    blkcode_re = re.compile(_blkcode_re, re.MULTILINE)

    cmds = core.Cmd.syntax.findtokens('cmddef', text)
    inlcodes = inlcode_re.finditer(text)
    blkcodes = blkcode_re.finditer(text)

    tokens = []
    for m in cmds:
        token = core.CmdToken(OOSaxHandler._decode_spec_chars(m.group(1)),
                m.start(0), m.end(0))
        tokens.append(token)
    for m in inlcodes:
        # XXX line breaks in OO are encoded with paragraph styling,
        # so this linearizing does not help
        tokentext = OOSaxHandler._decode_spec_chars(m.group(1))
        tokentext = core.InlCodeToken.linearize(tokentext)
        token = core.InlCodeToken(tokentext, m.start(0), m.end(0))
        tokens.append(token)
    for m in blkcodes:
        token = core.BlkCodeToken(
                OOSaxHandler._decode_spec_chars(m.group(1)),
                m.start(0), m.end(0))
        tokens.append(token)
    tokens.sort(key=lambda tok: tok.start)
    tokens.append(core.EndToken(None))
    #print [t.text for t in tokens]
    return tokens
def tokens(self, text): """Returns tokens >>> text = \ 'Example of Literate Programming in Markdown\\n' \ '===========================================\\n\\n' \ \ 'Code 1\\n' \ '------\\n\\n' \ \ 'Test if variable is negative looks like <<c.isneg>>: `if a < 0`.\\n' \ 'So, we can write absolute function <<c.fun>>:\\n\\n' \ \ ' def fun(x):\\n' \ ' <<=c.isneg,a:v>>:\\n' \ ' a += 100\\n' \ ' return -a\\n\\n' \ \ 'And <<c.sum>>:\\n\\n' \ \ ' def sum(x, y):\\n' \ ' return x+y\\n\\n' \ \ 'not code\\n' \ 'not code\\n\\n' \ \ 'Lalalalalalal\\n' \ 'Lalalalalalal\\n' >>> p = MDParser() >>> toks = p.tokens(text) >>> [tok.__class__.__name__ for tok in toks] ['CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken'] >>> toks[0].text 'c.isneg' >>> toks[1].text 'if a < 0' >>> toks[2].text 'c.fun' >>> toks[3].text.startswith('def fun') True >>> toks[3].text.endswith('return -a') True """ # left padding fragment of code-block indents = (4 * ' ', '\t') # possible padding indents = '|'.join(re.escape(s) for s in indents) _inlcode_re = r'`([^`]+)`' _blkcode_re = r'^\n(?P<code>((%s)(.*?)\n|\n)+)$' % indents _blkcode_lstrip_re = '^(%s)' % indents inlcode_re = re.compile(_inlcode_re) blkcode_re = re.compile(_blkcode_re, re.MULTILINE) blkcode_lstrip_re = re.compile(_blkcode_lstrip_re, re.MULTILINE) cmds = core.Cmd.syntax.findtokens('cmddef', text) inlcodes = inlcode_re.finditer(text) blkcodes = blkcode_re.finditer(text) tokens = [] for m in cmds: token = core.CmdToken(m.group(1), m.start(0), m.end(0)) tokens.append(token) for m in inlcodes: tokentext = core.InlCodeToken.linearize(m.group(1)) token = core.InlCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) for m in blkcodes: # find each block and replace first left indent with '' tokentext = m.group('code').strip('\n') tokentext = blkcode_lstrip_re.sub('', tokentext) token = core.BlkCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) tokens.sort(key=lambda tok: tok.start) tokens.append(core.EndToken(None)) return tokens
def tokens(self, text): """Returns tokens >>> text = \ '\\\\documentclass[12pt]{article}\\n' \ '\\\\usepackage{amsmath}\\n' \ '\\\\title{\\\\LaTeX}\\n' \ '\\\\date{}\\n' \ '\\\\begin{document}\\n' \ ' \\\\maketitle\\n' \ ' \\\\LaTeX{} is a document preparation system for the \\\\TeX{}\\n' \ '\\n' \ ' Testing of negative value <<isneg>>: \\\\verb#if a < 0#. Signature will\\n' \ ' be <<fn.abs.decl>>: \\\\verb!int abs(int a)!. And now:\\n' \ ' function absolute <<fn.abs>>:\\n' \ '\\n' \ ' \\\\begin{verbatim}\\n' \ ' <<=fn.abs.decl>>\\n' \ ' if (<<=isneg, x:a>>) return -a;\\n' \ ' else return a;\\n' \ ' }\\n' \ ' \\\\end{verbatim}\\n' \ '\\n' \ ' % This is a comment; it will not be shown in the final output.\\n' \ ' % The following shows a little of the typesetting power of LaTeX:\\n' \ ' \\\\begin{align}\\n' \ ' m &= \\\\frac{m_0}{\\\\sqrt{1-\\\\frac{v^2}{c^2}}}\\n' \ ' \\\\end{align}\\n' \ '\\\\end{document}' >>> p = TeXParser() >>> toks = p.tokens(text) >>> [tok.__class__.__name__ for tok in toks] ['CmdToken', 'InlCodeToken', 'CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken'] >>> toks[0].text 'isneg' >>> toks[1].text 'if a < 0' >>> toks[2].text 'fn.abs.decl' >>> toks[3].text == 'int abs(int a)' True >>> toks[5].text.startswith('<<=fn.abs.decl>>') True >>> toks[5].text.endswith('}') True """ _inlcmds = '|'.join(re.escape(s) for s in TeXParser.inlcmds) # text inside inline chunk _inltext = r'{(?P<code1>.+?)}|(?P<s>[+#!])(?P<code2>.+?)(?P=s)' _blkcmds = '|'.join(re.escape(s) for s in TeXParser.blkcmds) _inlcode_re = r'\\(%s)%s' % (_inlcmds, _inltext) _blkcode_re = r'\\begin(\[.+?\])?{(%s)}(?P<code>.*?)\\end{(%s)}' % ( _blkcmds, _blkcmds) inlcode_re = re.compile(_inlcode_re) blkcode_re = re.compile(_blkcode_re, re.DOTALL) cmds = core.Cmd.syntax.findtokens('cmddef', text) inlcodes = inlcode_re.finditer(text) blkcodes = blkcode_re.finditer(text) tokens = [] for m in cmds: token = core.CmdToken(m.group(1), m.start(0), m.end(0)) tokens.append(token) for m in inlcodes: groups = m.groupdict() g = 'code1' if groups['code1'] else 'code2' tokentext = core.InlCodeToken.linearize(m.group(g)) token = core.InlCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) for m in blkcodes: # find each block and replace first left indent with '' tokentext = m.group('code').lstrip('\n').rstrip('\n ') tokentext = core.deltextindent(tokentext) token = core.BlkCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) tokens.sort(key=lambda tok: tok.start) tokens.append(core.EndToken(None)) return tokens
def tokens(self, text): """Returns tokens >>> text = \ 'Example of Literate Programming in Asciidoc\\n' \ '===========================================\\n\\n' \ \ 'Code 1\\n' \ '------\\n\\n' \ \ 'Test if variable is negative looks like <<c.isneg>>: +if a < 0+.\\n' \ 'So, we can write absolute function <<c.fun>>\\n' \ '[source, python]\\n' \ '----\\n' \ ' def fun(x):\\n' \ ' <<=c.isneg,a:v>>:\\n' \ ' a += 100\\n' \ ' return -a\\n\\n' \ '----\\n\\n' \ 'And <<c.sum>>\\n\\n' \ '----\\n' \ ' def sum(x, y):\\n\\n' \ \ ' return x+y\\n\\n' \ ' x += 1\\n' \ '----' >>> p = AsciiDocParser() >>> toks = p.tokens(text) >>> [tok.__class__.__name__ for tok in toks] ['CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken'] >>> toks[0].text 'c.isneg' >>> toks[1].text 'if a < 0' >>> toks[2].text 'c.fun' >>> toks[3].text.startswith('def fun') True >>> toks[3].text.endswith(' return -a') True >>> toks[5].text.startswith('def sum') True >>> toks[5].text.endswith('x += 1') True """ _inlcode_re = r'\+([^\n\+]+)\+' _blkcode_re = r'(?:\[source[^]]*?]\n|\n\n)-{4,}(?P<code>.*?)\n-{4,}' inlcode_re = re.compile(_inlcode_re) blkcode_re = re.compile(_blkcode_re, re.DOTALL) cmds = core.Cmd.syntax.findtokens('cmddef', text) inlcodes = inlcode_re.finditer(text) blkcodes = blkcode_re.finditer(text) tokens = [] for m in cmds: token = core.CmdToken(m.group(1), m.start(0), m.end(0)) tokens.append(token) for m in inlcodes: tokentext = core.InlCodeToken.linearize(m.group(1)) token = core.InlCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) for m in blkcodes: # find each block and replace first left indent with '' tokentext = m.group('code').strip('\n') tokentext = core.deltextindent(tokentext) token = core.BlkCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) tokens.sort(key=lambda tok: tok.start) tokens.append(core.EndToken(None)) return tokens
def tokens(self, text): """Returns tokens >>> text = \ '= Example of Literate Programming in Txt2Tags =\\n' \ \ '== Code 1 ==\\n' \ \ 'Test if variable is negative looks like <<c.isneg>>: ``if a < 0``.\\n' \ 'So, we can write absolute function <<c.fun>>\\n' \ '```\\n' \ ' def fun(x):\\n' \ ' <<=c.isneg,a:v>>:\\n' \ ' a += 100\\n' \ ' return -a\\n\\n' \ '```\\n' \ 'And <<c.sum>>\\n' \ '```\\n' \ ' def sum(x, y):\\n\\n' \ \ ' return x+y\\n\\n' \ ' x += 1\\n' \ '```\\n' \ 'Last chunk is <<c.run>>\\n' \ '``` $ ls -F1' >>> p = Txt2TagsParser() >>> toks = p.tokens(text) >>> [tok.__class__.__name__ for tok in toks] ['CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken'] >>> toks[0].text 'c.isneg' >>> toks[1].text 'if a < 0' >>> toks[2].text 'c.fun' >>> toks[3].text.startswith('def fun') True >>> toks[3].text.endswith(' return -a') True >>> toks[5].text.startswith('def sum') True >>> toks[5].text.endswith('x += 1') True >>> toks[7].text '$ ls -F1' """ _inlcode_re = r'``([^\n`]+)``' _blkcode_re1 = r'(?:\n|^)```(?P<code>[^\n]+)(?:\n|$)' _blkcode_re2 = r'(?:\n|^)```\n+(?P<code>.*?)\n```(?:\n|$)' inlcode_re = re.compile(_inlcode_re) blkcode_re1 = re.compile(_blkcode_re1, re.DOTALL) blkcode_re2 = re.compile(_blkcode_re2, re.DOTALL) cmds = core.Cmd.syntax.findtokens('cmddef', text) inlcodes = inlcode_re.finditer(text) blkcodes = itertools.chain(blkcode_re1.finditer(text), blkcode_re2.finditer(text)) tokens = [] for m in cmds: token = core.CmdToken(m.group(1), m.start(0), m.end(0)) tokens.append(token) for m in inlcodes: tokentext = core.InlCodeToken.linearize(m.group(1)) token = core.InlCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) for m in blkcodes: # find each block and replace first left indent with '' tokentext = m.group('code').strip('\n') tokentext = core.deltextindent(tokentext) token = core.BlkCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) tokens.sort(key=lambda tok: tok.start) tokens.append(core.EndToken(None)) return tokens
def tokens(self, text): """Returns tokens >>> text = \ '= Example of Literate Programming in Creole =\\n' \ \ '== Code 1 ==\\n' \ \ 'Test if variable is negative looks like <<c.isneg>>: {{{if a < 0}}}.\\n' \ 'So, we can write absolute function <<c.fun>>:\\n\\n' \ '{{{\\n' \ ' def fun(x):\\n' \ ' <<=c.isneg,a:v>>:\\n' \ ' a += 100\\n' \ ' return -a }}}\\n\\n' \ \ 'And <<c.sum>>:\\n\\n' \ \ '{{{ def sum(x, y):\\n' \ ' return x+y }}}\\n\\n' \ \ 'not code\\n' \ 'not code\\n\\n' \ \ 'Lalalalalalal\\n' \ 'Lalalalalalal\\n' >>> p = CreoleParser() >>> toks = p.tokens(text) >>> [tok.__class__.__name__ for tok in toks] ['CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken'] >>> toks[0].text 'c.isneg' >>> toks[1].text 'if a < 0' >>> toks[2].text 'c.fun' >>> toks[3].text.startswith(' def fun') True >>> toks[3].text.endswith('return -a') True """ # XXX Only block chunks, inline and block Creole chunks are the same. # In real Creole distinguish block|inline chunks, but it's not # valuable for LP _blkcode_re = r'{{{\n*(?P<code>.*?)[\ \n]*}}}' blkcode_re = re.compile(_blkcode_re, re.DOTALL | re.MULTILINE) cmds = core.Cmd.syntax.findtokens('cmddef', text) blkcodes = blkcode_re.finditer(text) tokens = [] for m in cmds: token = core.CmdToken(m.group(1), m.start(0), m.end(0)) tokens.append(token) for m in blkcodes: # find each block and replace first left indent with '' tokentext = m.group('code') token = core.BlkCodeToken(tokentext, m.start(0), m.end(0)) tokens.append(token) tokens.sort(key=lambda tok: tok.start) tokens.append(core.EndToken(None)) return tokens