コード例 #1
0
def main():  # pragma: no cover
    """Entry point of the word tokenizer command-line tool.

    Parses the command line with ``docopt``, then reads the input line by
    line, tokenizes each line with ``simple_word_tokenize`` and writes the
    resulting tokens joined by single spaces, followed by a newline.

    This function never returns normally; it always terminates through
    ``sys.exit``: status 0 on success, status 1 on a tokenization error, a
    keyboard interrupt or any other unexpected error.
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        # A bit redundant for now, but makes adding new tokenizers easier in
        # future
        tokenize_fn = simple_word_tokenize

        # Open files (or just use stdin and stdout)
        fin, fout = open_files(arguments['FILE'], arguments['--output'])

        # Tokenize lines
        try:
            for line in fin:
                fout.write(' '.join(tokenize_fn(line)))
                fout.write('\n')

        # If everything worked so far, this shouldn't happen
        except Exception:  # pylint: disable=W0703
            sys.stderr.write('Error: An unknown error occurred during '
                             'tokenization.\n')
            sys.exit(1)

        # Cleanup. Runs on success, on the error exit above and on
        # KeyboardInterrupt, so explicitly opened files are always closed.
        # The standard streams (argument is None) are left open.
        finally:
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)

    # SystemExit is not an Exception subclass, so the sys.exit() calls above
    # pass through this handler untouched.
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
コード例 #2
0
ファイル: camel_dediac.py プロジェクト: slkh/camel_tools
def main():  # pragma: no cover
    """Entry point of the ``camel_dediac`` command-line tool.

    Parses the command line with ``docopt``. With ``--list`` it prints the
    entries of ``_BUILTIN_SCHEMES`` and exits. Otherwise it looks up the
    de-diacritization function whose scheme name equals ``--scheme`` and
    applies it to every input line, writing the result to the output.

    When ``--ignore-markers`` is set, each line is first split with
    ``_WHITESPACE_RE.findall`` and the pieces are processed by
    ``_dediac_marked_tokens``; otherwise the whole line is passed to the
    de-diacritization function directly.

    This function never returns normally; it always terminates through
    ``sys.exit``: status 0 on success (including ``--list``), status 1 on an
    invalid scheme, a processing error, a keyboard interrupt or any other
    unexpected error.
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{}   {}".format(scheme[0].ljust(8), scheme[1]))
            sys.exit(0)

        dediac_fn = None

        # Each scheme entry is indexed as: [0] the name matched against
        # --scheme, [2] the function used to de-diacritize text.
        for scheme in _BUILTIN_SCHEMES:
            if scheme[0] == arguments['--scheme']:
                dediac_fn = scheme[2]

        if dediac_fn is None:
            sys.stderr.write('Error: {} is not a valid scheme.\n'
                             'Run `camel_dediac -l` to see the list'
                             ' of available schemes.'
                             '\n'.format(repr(arguments['--scheme'])))
            sys.exit(1)

        strip_markers = arguments['--strip-markers']
        marker = arguments['--marker']
        ignore_markers = arguments['--ignore-markers']

        # Open files (or just use stdin and stdout)
        fin, fout = open_files(arguments['FILE'], arguments['--output'])

        # De-diacritize lines
        try:
            if ignore_markers:
                for line in fin:
                    toks = _WHITESPACE_RE.findall(line)
                    dediac_toks = _dediac_marked_tokens(toks, dediac_fn,
                                                        marker, strip_markers)
                    fout.write(''.join(dediac_toks))
            else:
                for line in fin:
                    fout.write(dediac_fn(line))

        # If everything worked so far, this shouldn't happen
        except Exception:  # pylint: disable=W0703
            sys.stderr.write('Error: An unknown error occurred during '
                             'de-diacritization.\n')
            sys.exit(1)

        # Cleanup. Runs on success, on the error exit above and on
        # KeyboardInterrupt, so explicitly opened files are always closed.
        # The standard streams (argument is None) are left open.
        finally:
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)

    # SystemExit is not an Exception subclass, so the sys.exit() calls above
    # pass through this handler untouched.
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
コード例 #3
0
def main():
    """Entry point of the ``camel_diac`` command-line tool.

    Parses the command line with ``docopt``. With ``--list`` it prints the
    name and description of every entry in ``_BUILTIN_DBS`` and exits.
    Otherwise it checks that ``--db`` names one of those entries, loads the
    matching disambiguator with ``MLEDisambiguator.pretrained`` and, for each
    input line, splits the line with ``_WHITESPACE_RE.findall``, passes the
    pieces to ``_diac_tokens`` and writes the concatenated result to the
    output.

    This function never returns normally; it always terminates through
    ``sys.exit``: status 0 on success (including ``--list``), status 1 on an
    invalid database name, a processing error, a keyboard interrupt or any
    other unexpected error.
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for db_entry in _BUILTIN_DBS:
                print("{}   {}".format(db_entry.name.ljust(8),
                                       db_entry.description))
            sys.exit(0)

        db_name = None

        # Validate the requested database name against the built-in list.
        for db_entry in _BUILTIN_DBS:
            if arguments['--db'] == db_entry.name:
                db_name = db_entry.name

        if db_name is None:
            sys.stderr.write('Error: {} is not a valid database name.\n'
                             'Run `camel_diac -l` to see the list of available'
                             ' databases.\n'.format(repr(arguments['--db'])))
            sys.exit(1)

        disambig = MLEDisambiguator.pretrained(db_name)

        marker = arguments['--marker']
        ignore_markers = arguments['--ignore-markers']
        strip_markers = arguments['--strip-markers']
        pretokenized = arguments['--pretokenized']

        # Open files (or just use stdin and stdout)
        fin, fout = open_files(arguments['FILE'], arguments['--output'])

        # Diacritize lines
        try:
            for line in fin:
                toks = _WHITESPACE_RE.findall(line)
                diac_toks = _diac_tokens(toks, disambig, ignore_markers,
                                         marker, strip_markers, pretokenized)
                fout.write(''.join(diac_toks))

        # If everything worked so far, this shouldn't happen
        except Exception:  # pylint: disable=W0703
            sys.stderr.write('Error: An unknown error occurred during '
                             'diacritization.\n')
            sys.exit(1)

        # Cleanup. Runs on success, on the error exit above and on
        # KeyboardInterrupt, so explicitly opened files are always closed.
        # The standard streams (argument is None) are left open.
        finally:
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)

    # SystemExit is not an Exception subclass, so the sys.exit() calls above
    # pass through this handler untouched.
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)