def main():
    """
    Command-line entry point for the C53 XML converter.

    Builds the CLI configuration (input files matching ``*.xml``),
    parses the command line and passes the configuration, the
    ``convert`` function and the parsed arguments to ``generic_main``.
    """
    config = CliConfig(
        description='C53 xml to text',
        input_description='XML files (via antiword)',
        glob='*.xml',
    )
    cli_args = iodir_argparser(config).parse_args()
    generic_main(config, convert, cli_args)
def main():
    """
    Command-line entry point for the Fine Rolls XML converter.

    Builds the CLI configuration (input files matching ``roll*.xml``),
    parses the command line and passes the configuration, the
    ``convert`` function and the parsed arguments to ``generic_main``.
    """
    config = CliConfig(
        description='Fine Rolls xml to text',
        input_description='XML files (manually annotated TEI)',
        glob='roll*.xml',
    )
    cli_args = iodir_argparser(config).parse_args()
    generic_main(config, convert, cli_args)
def main():
    """
    Command-line entry point for the TTT petitions converter.

    Builds the CLI configuration (input files matching ``*.dat``),
    parses the command line and passes the configuration, the
    ``convert`` function and the parsed arguments to ``generic_main``.
    """
    config = CliConfig(
        description='TTT petitions converter',
        input_description='.dat files',
        glob='*.dat',
    )
    cli_args = iodir_argparser(config).parse_args()
    generic_main(config, convert, cli_args)
def main():
    """
    Command-line entry point for the state papers converter.

    Builds the CLI configuration (input files matching ``*.xml``),
    parses the command line and passes the configuration, the
    ``_do_file`` function and the parsed arguments to ``generic_main``.
    """
    config = CliConfig(
        description='state papers to text',
        input_description='XML files',
        glob='*.xml',
    )
    cli_args = iodir_argparser(config).parse_args()
    generic_main(config, _do_file, cli_args)
def main():
    """
    Command-line entry point for the annotations converter.

    On top of the arguments supplied by ``iodir_argparser``, accepts
    ``--format`` (``human`` or ``gate``, default ``human``). The
    converter returned by ``mk_converter`` for the chosen format is
    passed to ``generic_main`` along with the configuration and the
    parsed arguments.
    """
    config = CliConfig(
        description='annotations converter',
        input_description='annotated text files',
        glob='*',
    )
    parser = iodir_argparser(config)
    parser.add_argument(
        '--format',
        choices=['human', 'gate'],
        default='human',
        help='input markup format',
    )
    cli_args = parser.parse_args()

    converter = mk_converter(cli_args.format)
    generic_main(config, converter, cli_args)
def main():
    """
    Command-line entry point for the crude annotations viewer.

    Walks the whole input directory tree, recreates each visited
    directory under the output directory (same relative path), and
    calls ``save_occurrences`` once per file found.
    """
    config = CliConfig(
        description='crude annotations viewer',
        input_description='annotation json',
        glob='*',
    )
    cli_args = iodir_argparser(config).parse_args()
    out_base = cli_args.output

    for src_dir, _, basenames in os.walk(cli_args.input):
        # mirror the input sub-directory below the output directory
        rel_dir = fp.relpath(src_dir, cli_args.input)
        dest_dir = fp.join(out_base, rel_dir)
        if not fp.exists(dest_dir):
            os.makedirs(dest_dir)
        for basename in basenames:
            save_occurrences(src_dir, dest_dir, basename)
def main():
    """
    Command-line entry point for the TTT petitions metadata extractor.

    Reads every file in the input directory matching the configured
    glob (``*.dat``) as ISO-8859-1, and gathers, for each key matched
    by ``_TAG_RE``, the set of distinct values seen across all files.
    One UTF-8 file per key is then written to the output directory
    (named after the key), holding the sorted values one per line.

    The output directory is created if it does not exist yet.
    """
    # local import: only needed for creating the output directory
    import os

    cfg = CliConfig(description='TTT petitions metadata extractor',
                    input_description='.dat files',
                    glob='*.dat')
    psr = iodir_argparser(cfg)
    args = psr.parse_args()

    values = defaultdict(set)
    for filename in glob.glob(fp.join(args.input, cfg.glob)):
        with codecs.open(filename, 'r', 'iso8859-1') as stream:
            content = stream.read()
        # _TAG_RE is expected to yield (key, value) pairs
        for key, val in _TAG_RE.findall(content):
            values[key].add(val)

    # without this, opening the first output file fails when the
    # output directory is missing
    if not fp.exists(args.output):
        os.makedirs(args.output)

    for key, vals in values.items():
        ofile = fp.join(args.output, key)
        with codecs.open(ofile, 'w', 'utf-8') as stream:
            print("\n".join(sorted(vals)), file=stream)
def main():
    """
    Command-line entry point for the Fine Rolls metadata extractor.

    Parses every file in the input directory matching the configured
    glob (``roll*.xml``) and, for each tag in ``_TAGS``, gathers the
    set of distinct whitespace-normalised node texts. One UTF-8 file
    per tag is then written to the output directory (created if
    missing), holding the sorted texts one per line.
    """
    config = CliConfig(
        description='Fine Rolls xml to text',
        input_description='XML files (manually annotated TEI)',
        glob='roll*.xml',
    )
    cli_args = iodir_argparser(config).parse_args()

    texts_by_tag = collections.defaultdict(set)
    pattern = fp.join(cli_args.input, config.glob)
    for xml_path in glob.glob(pattern):
        document = ET.parse(xml_path)
        for tag in _TAGS:
            for element in document.iter(tag):
                raw = element.text or ''
                # collapse any run of whitespace into a single space
                normalised = ' '.join(raw.split())
                texts_by_tag[tag].add(normalised)

    if not fp.exists(cli_args.output):
        os.makedirs(cli_args.output)

    for tag, texts in texts_by_tag.items():
        out_path = fp.join(cli_args.output, tag)
        with codecs.open(out_path, 'w', 'utf-8') as stream:
            print("\n".join(sorted(texts)), file=stream)