コード例 #1
0
def fairseq_preprocess(dataset,
                       dict_path=None,
                       source_lang='complex',
                       target_lang='simple',
                       workers=32):
    """Run the fairseq preprocessing CLI on ``dataset`` and return its output directory.

    The output directory is
    ``<dataset_dir>/fairseq_preprocessed_<source_lang>-<target_lang>``. The
    preprocessing itself runs inside ``create_directory_or_skip``, so it is
    presumably skipped when that directory already exists (TODO confirm
    against the helper); the directory path is returned either way.

    Args:
        dataset: Dataset identifier understood by ``get_dataset_dir`` and
            ``get_data_filepath``.
        dict_path: Optional dictionary path forwarded as ``--srcdict``. When
            ``None`` the option is omitted.
        source_lang: Value passed as ``--source-lang``; also used as the file
            suffix of the source-side symlinks.
        target_lang: Value passed as ``--target-lang``; also used as the file
            suffix of the target-side symlinks.
        workers: Value passed as ``--workers`` (defaults to the previously
            hard-coded 32).

    Returns:
        The preprocessed-data directory (``dataset_dir / ...``).
    """
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: expose each original-language file under the requested
            # language suffix through a symlink, so the prefixes handed to
            # the CLI below resolve for source_lang / target_lang.
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES,
                                                  [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase,
                                                     new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(
                            get_data_filepath(dataset, phase, language))
            # Ask for a path with a throw-away 'dummy' language and drop that
            # marker to obtain the language-independent file prefix.
            trainpref, validpref, testpref = (
                str(get_data_filepath(dataset, phase,
                                      'dummy')).replace('.dummy', '')
                for phase in ('train', 'valid', 'test'))
            # Build argv as a list rather than formatting one string and
            # re-splitting it: paths containing whitespace or quote
            # characters then reach the CLI intact.
            args = [
                '--source-lang', str(source_lang),
                '--target-lang', str(target_lang),
                '--trainpref', trainpref,
                '--validpref', validpref,
                '--testpref', testpref,
                '--destdir', str(preprocessed_dir),
                '--bpe', 'sentencepiece',
                '--joined-dictionary',
                '--workers', str(workers),
            ]
            if dict_path is not None:
                args.extend(['--srcdict', str(dict_path)])
            # Echo a copy-pasteable command line; quoting only kicks in for
            # arguments that actually need it.
            print('fairseq-preprocess '
                  + ' '.join(shlex.quote(arg) for arg in args))
            with mock_cli_args(args):
                preprocess.cli_main()
        return preprocessed_dir
コード例 #2
0
ファイル: preprocess.py プロジェクト: Fei-WL/CCMT
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Legacy entry point. Use fairseq_cli/preprocess.py or fairseq-preprocess instead.
"""

from fairseq_cli.preprocess import cli_main

# Only run the CLI when executed as a script; importing this module merely
# re-exports ``cli_main``.
if __name__ == "__main__":
    cli_main()
コード例 #3
0
ファイル: preprocess.py プロジェクト: teanakamura/fairseq
import re
import sys
from os import path
# Two levels up from this file's own path, then into 'fairseq': i.e. the
# 'fairseq' directory that sits next to the directory containing this script.
fairseq_path = path.abspath(path.join(path.abspath(__file__), '../../fairseq'))
# Must happen before the import below so that this checkout takes precedence
# over any other 'fairseq_cli' found later on sys.path.
sys.path.insert(0, fairseq_path)
from fairseq_cli.preprocess import cli_main

if __name__ == '__main__':
    # Strip a trailing '-script.py', '-script.pyw' or '.exe' from the program
    # name (presumably to normalise launcher-wrapper names -- TODO confirm).
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    # Use cli_main's return value as the process exit status.
    sys.exit(cli_main())