Example #1
    def itertokens(self):
        """Yield TeX tokens from self.filename, inlining \\input/\\include
        files and expanding user-defined macros along the way."""
        dirname = os.path.dirname(self.filename)
        tex = TeX(file=self.filename)
        tokens = tex.itertokens()

        try:
            while True:
                token = next(tokens)
                if token.data == 'input':
                    # Splice the tokens of the \input file into the stream.
                    tokens = self.add_input(dirname, tokens)
                elif token.data == 'import':
                    # TODO handle \subimport, and also \import* and \subimport*
                    print("WARNING: we don't handle \\import yet")
                    yield token
                elif token.data == 'include':
                    # TODO be aware of \includeonly
                    tokens = self.add_input(dirname, tokens)
                elif token.data == 'newcommand':
                    # Record the definition so later uses can be expanded.
                    try:
                        name = read_macro_name(tokens)
                        args_or_def = read_balanced_brackets(tokens)
                        if args_or_def[0] == '[':  # optional [n] argument count
                            n_args = format_n_args(args_or_def)
                            definition = read_balanced_brackets(tokens)
                        else:
                            n_args = 0
                            definition = args_or_def
                        # definition[1:-1] strips the enclosing braces.
                        self.macro_lut[name] = MacroDef(
                            n_args, definition[1:-1])
                    except Exception:
                        # Malformed \newcommand: emit the token unchanged.
                        yield token
                else:
                    yield from maybe_expand_macro(token, tokens, self.macro_lut)
        except StopIteration:
            pass
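
The method above relies on helpers defined elsewhere in the file (read_macro_name, read_balanced_brackets, format_n_args, MacroDef, maybe_expand_macro, add_input). As a minimal sketch of the contract two of them are assumed to satisfy, treating tokens as objects that stringify to single characters (the real versions operate on plasTeX tokens, so the details here are illustrative, not the project's actual code):

import collections

# Assumed shape of the macro-table entries used by itertokens above.
MacroDef = collections.namedtuple('MacroDef', ['n_args', 'definition'])

def read_balanced_brackets(tokens):
    # Consume one balanced {...} or [...] group from the stream and
    # return its tokens, including the outer delimiters, so callers
    # can strip them with definition[1:-1] as itertokens does.
    opener = next(tokens)
    closer = {'{': '}', '[': ']'}[str(opener)]
    group = [opener]
    depth = 1
    for tok in tokens:
        group.append(tok)
        if str(tok) == str(opener):
            depth += 1
        elif str(tok) == closer:
            depth -= 1
            if depth == 0:
                break
    return group

def format_n_args(bracket_group):
    # Turn a [n] group into the integer argument count.
    return int(''.join(str(t) for t in bracket_group[1:-1]))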
Example #2
import sys
from plasTeX.TeX import TeX
from utils import pre_tokenize, post_tokenize, BASIC_SKELETON

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: python %s <tabulars> <tokenized-tabulars>' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)
    with open(sys.argv[1]) as fin, open(sys.argv[2], 'w') as fout:
        for line in fin:
            # Each input line is "<id>\t<tabular source>".
            _, tabular = line.split('\t', 1)
            # Wrap the tabular in a minimal document so plasTeX can parse it.
            tex = TeX()
            tex.input(BASIC_SKELETON % pre_tokenize(tabular))
            tokens = list(tex.itertokens())
            tokens_out = post_tokenize(tokens)
            # Slice off the characters contributed by BASIC_SKELETON;
            # the offsets are tied to that exact template.
            fout.write(' '.join(tokens_out)[385:-86] + '\n')
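
pre_tokenize, post_tokenize, and BASIC_SKELETON come from the project's utils module, which isn't shown here. Purely as an assumption about the contract the script relies on, they might look roughly like this (the real module, and in particular the skeleton whose exact length the [385:-86] slice depends on, may differ):

# Hypothetical sketch of the utils contract; the names match the imports
# above but the bodies are assumptions, not the project's actual code.
BASIC_SKELETON = (
    r'\documentclass{article}'
    r'\begin{document}'
    '%s'
    r'\end{document}'
)

def pre_tokenize(tabular):
    # Normalize whitespace so plasTeX sees a single well-formed source line.
    return ' '.join(tabular.split())

def post_tokenize(tokens):
    # Map plasTeX token objects back to plain strings.
    return [str(t) for t in tokens]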