def translate(parser):
    """Convert the input document named on the command line to the output format.

    The parsed options must provide ``input`` and ``output`` file paths. The
    only input extension handled is ``.docx`` and the only output extension
    handled is ``.tex``; for ``.tex`` output a sibling ``.bib`` file with the
    same root name is written as well.

    Args:
        parser: an argument parser whose ``parse_args()`` result has
            ``input`` and ``output`` attributes.

    Raises:
        NotImplementedError: if the input or output file extension is not
            one of the supported ones. The output extension is only checked
            after the input has been read.
    """
    opts = parser.parse_args()
    logger = logging.getLogger(__name__)

    # read input
    input_extension = os.path.splitext(opts.input)[1]
    if input_extension == '.docx':
        logger.info('Reading from filepath: %s', opts.input)
        # imported lazily so the reader is only loaded when it is needed
        from xdoc.formats.docx import read
        # A .docx file is a ZIP container, i.e. binary data: open it in
        # binary mode so no newline translation or text decoding is applied.
        with open(opts.input, 'rb') as fp:
            document = read(fp)
    else:
        raise NotImplementedError('File extension "%s" not supported as input' % input_extension)

    # write output
    output_root, output_extension = os.path.splitext(opts.output)
    if output_extension == '.tex':
        tex_filepath = output_root + '.tex'
        bib_filepath = output_root + '.bib'
        logger.info('Writing to filepaths: %s & %s', tex_filepath, bib_filepath)
        from xdoc.formats.tex import write
        with open(tex_filepath, 'w') as tex_fp, open(bib_filepath, 'w') as bib_fp:
            write(tex_fp, bib_fp, document)
    else:
        raise NotImplementedError('File extension "%s" not supported as output' % output_extension)
def main():
    """Command-line entry point: parse arguments, set up logging, run the action.

    Builds the argument parser (positional ``input`` and ``output`` paths,
    ``--action`` restricted to the keys of the module-level ``actions``
    table, and ``--verbose``), sets the root logger to DEBUG when verbose is
    given and INFO otherwise, then calls the selected action with the parser
    itself (not the parsed options).
    """
    parser = argparse.ArgumentParser(
        description='Usage: xdoc original.docx converted.tex',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Earlier variants, kept for reference, defaulted to stdin/stdout streams:
    # parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    # parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('input', help='input filename')
    parser.add_argument('output', help='output filename')
    parser.add_argument(
        '-a', '--action', choices=actions, default='translate', help='xdoc action')
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='Log extra output')
    args = parser.parse_args()

    if args.verbose:
        logging.root.setLevel(logging.DEBUG)
    else:
        logging.root.setLevel(logging.INFO)

    log = logging.getLogger(__name__)
    root_level = logging.root.level
    log.info('Logging with level >= %s (%s)', root_level, logging.getLevelName(root_level))

    # the action receives the parser and parses the arguments again itself
    run_action = actions[args.action]
    run_action(parser)
    log.debug('Done')
def parsebib(parser):
    """Resolve one bibliography entry per input line and write it out as TeX.

    Each input line is stripped, decoded as UTF-8 and passed to
    ``crossref_lookup``; the first result (if any) is serialized with
    ``serialize_reference`` and printed to the output. Lines that yield no
    result are reported through the logger at error level.

    An ``input`` of ``'-'`` reads from standard input and an ``output`` of
    ``'-'`` writes to standard output; any other value is used as a file path.

    NOTE: this function uses Python 2 constructs (``print >>`` and
    ``str.decode``), matching the rest of the file.

    Args:
        parser: an argument parser whose ``parse_args()`` result has
            ``input`` and ``output`` attributes.
    """
    opts = parser.parse_args()
    logger = logging.getLogger(__name__)

    from xdoc.bibliography import crossref_lookup
    from xdoc.formats.tex import serialize_reference

    use_stdin = opts.input == '-'
    use_stdout = opts.output == '-'

    source = sys.stdin if use_stdin else open(opts.input)
    try:
        # The output must be opened for writing; the default mode is
        # read-only and every write to it would fail.
        sink = sys.stdout if use_stdout else open(opts.output, 'w')
        try:
            for line in source:
                line = line.strip().decode('utf8')
                logger.info('Resolving "%s" via CrossRef API', line)
                for bibitem in crossref_lookup(line):
                    # only the first lookup result is used
                    print >> sink, serialize_reference(bibitem)
                    break
                else:
                    # the lookup produced no results at all
                    logger.error('FIXME: could not parse bib item: %s', line)
        finally:
            # close only files opened here, never the process-wide streams
            if not use_stdout:
                sink.close()
    finally:
        if not use_stdin:
            source.close()
# a lot of this is from Tweedr import crfsuite import re from colorama import Fore import tempfile from xdoc.formats.tex import serialize_reference from xdoc.lib.text import utf8str from unidecode import unidecode from viz import gloss # from sklearn import linear_model, naive_bayes, neighbors, svm # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from xdoc.lib.log import logging logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) xml_begin = re.compile('<(\w+)>') xml_end = re.compile('</(\w+)>') # tex_command = re.compile(r'\\[a-z]+\{([^\}]+)\}') def mean(xs): return sum(xs) / float(len(xs)) class ItemSequence(crfsuite.ItemSequence): def __init__(self, features_iter, check=False): '''Create new ItemSequence, typedef std::vector<Item> based on the given iterable of iterable of 2-tuples or strings. If check=True, any unicode present in the given features_iter will be encoded into a bytestring as utf8.'''