def main():  # pragma: no cover
    """Run the word-tokenizer command-line interface.

    Parses the command line with ``docopt`` against the module docstring,
    then writes every input line to the output as its tokens joined by a
    single space and followed by a newline.

    This function never returns normally; it always leaves through
    ``sys.exit``: status 0 on success, status 1 on a keyboard interrupt or
    on any error (after writing a message to standard error).
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        # A bit redundant for now, but makes adding new tokenizers easier in
        # future
        tokenize_fn = simple_word_tokenize

        # Open files (or just use stdin and stdout)
        fin, fout = open_files(arguments['FILE'], arguments['--output'])

        # Tokenize lines
        try:
            for line in fin:
                fout.write(' '.join(tokenize_fn(line)))
                fout.write('\n')

        # If everything worked so far, this shouldn't happen
        except Exception:  # pylint: disable=W0703
            sys.stderr.write('Error: An unknown error occurred during '
                             'tokenization.\n')
            sys.exit(1)

        # Cleanup. Placed in `finally` so that the handles are also released
        # (and buffered output flushed) when the loop above fails or is
        # interrupted, not only on the success path.
        finally:
            # Only close what was opened for an explicit argument; with no
            # argument the handle is presumably a standard stream.
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
def main():  # pragma: no cover
    """Run the de-diacritization command-line interface.

    Parses the command line with ``docopt`` against the module docstring.
    With ``--list`` it prints each entry of ``_BUILTIN_SCHEMES`` (first
    field left-justified to 8 characters, then the second field) and exits
    with status 0. Otherwise it looks up the entry whose first field equals
    ``--scheme`` and applies that entry's third field, a callable, to the
    input.

    This function never returns normally; it always leaves through
    ``sys.exit``: status 0 on success, status 1 on an invalid scheme, a
    keyboard interrupt or any error (after writing a message to standard
    error).
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{} {}".format(scheme[0].ljust(8), scheme[1]))
            sys.exit(0)

        # Resolve the requested scheme name to its de-diacritization
        # function. The loop does not stop early, so if several entries
        # share a name the last one wins.
        dediac_fn = None
        for scheme in _BUILTIN_SCHEMES:
            if scheme[0] == arguments['--scheme']:
                dediac_fn = scheme[2]

        if dediac_fn is None:
            sys.stderr.write('Error: {} is not a valid scheme.\n'
                             'Run `camel_dediac -l` to see the list'
                             ' of available schemes.'
                             '\n'.format(repr(arguments['--scheme'])))
            sys.exit(1)

        strip_markers = arguments['--strip-markers']
        marker = arguments['--marker']
        ignore_markers = arguments['--ignore-markers']

        # Open files (or just use stdin and stdout)
        fin, fout = open_files(arguments['FILE'], arguments['--output'])

        # De-diacritize lines
        try:
            if ignore_markers:
                # Marker-aware path: split the line with _WHITESPACE_RE and
                # let _dediac_marked_tokens decide per token.
                for line in fin:
                    toks = _WHITESPACE_RE.findall(line)
                    dediac_toks = _dediac_marked_tokens(toks, dediac_fn,
                                                        marker, strip_markers)
                    fout.write(''.join(dediac_toks))
            else:
                for line in fin:
                    fout.write(dediac_fn(line))

        # If everything worked so far, this shouldn't happen
        except Exception:  # pylint: disable=W0703
            sys.stderr.write('Error: An unknown error occurred during '
                             'de-diacritization.\n')
            sys.exit(1)

        # Cleanup. Placed in `finally` so that the handles are also released
        # (and buffered output flushed) when the loop above fails or is
        # interrupted, not only on the success path.
        finally:
            # Only close what was opened for an explicit argument; with no
            # argument the handle is presumably a standard stream.
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
def main():
    """Run the diacritization command-line interface.

    Parses the command line with ``docopt`` against the module docstring.
    With ``--list`` it prints the ``name`` (left-justified to 8 characters)
    and ``description`` of each entry in ``_BUILTIN_DBS`` and exits with
    status 0. Otherwise it checks ``--db`` against the entry names, loads
    ``MLEDisambiguator.pretrained`` for that name, and writes each input
    line after passing its tokens through ``_diac_tokens``.

    This function never returns normally; it always leaves through
    ``sys.exit``: status 0 on success, status 1 on an invalid database
    name, a keyboard interrupt or any error (after writing a message to
    standard error).
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for db_entry in _BUILTIN_DBS:
                print("{} {}".format(db_entry.name.ljust(8),
                                     db_entry.description))
            sys.exit(0)

        # Validate the requested database name against the built-in list.
        db_name = None
        for db_entry in _BUILTIN_DBS:
            if arguments['--db'] == db_entry.name:
                db_name = db_entry.name

        if db_name is None:
            sys.stderr.write('Error: {} is not a valid database name.\n'
                             'Run `camel_diac -l` to see the list of available'
                             ' databases.\n'.format(repr(arguments['--db'])))
            sys.exit(1)

        disambig = MLEDisambiguator.pretrained(db_name)

        marker = arguments['--marker']
        ignore_markers = arguments['--ignore-markers']
        strip_markers = arguments['--strip-markers']
        pretokenized = arguments['--pretokenized']

        # Open files (or just use stdin and stdout)
        fin, fout = open_files(arguments['FILE'], arguments['--output'])

        # Diacritize lines
        try:
            for line in fin:
                toks = _WHITESPACE_RE.findall(line)
                diac_toks = _diac_tokens(toks, disambig, ignore_markers,
                                         marker, strip_markers, pretokenized)
                fout.write(''.join(diac_toks))

        # If everything worked so far, this shouldn't happen
        except Exception:  # pylint: disable=W0703
            sys.stderr.write('Error: An unknown error occurred during '
                             'diacritization.\n')
            sys.exit(1)

        # Cleanup. Placed in `finally` so that the handles are also released
        # (and buffered output flushed) when the loop above fails or is
        # interrupted, not only on the success path.
        finally:
            # Only close what was opened for an explicit argument; with no
            # argument the handle is presumably a standard stream.
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)