def fairseq_preprocess(dataset, dict_path=None, source_lang='complex', target_lang='simple', workers=32):
    """Run ``fairseq-preprocess`` on ``dataset`` and return the output directory.

    The output is written to
    ``<dataset_dir>/fairseq_preprocessed_<source_lang>-<target_lang>``. The work
    runs while holding ``lock_directory(dataset_dir)`` and inside
    ``create_directory_or_skip(preprocessed_dir)`` (presumably a no-op when the
    directory already exists -- confirm against that helper).

    Args:
        dataset: Dataset identifier passed to ``get_dataset_dir`` and
            ``get_data_filepath``.
        dict_path: Optional dictionary path forwarded as ``--srcdict``; omitted
            from the command line when ``None``.
        source_lang: Value passed as ``--source-lang``; also used as the file
            suffix for the source-side symlinks.
        target_lang: Value passed as ``--target-lang``; also used as the file
            suffix for the target-side symlinks.
        workers: Value passed as ``--workers`` (defaults to the previously
            hard-coded 32).

    Returns:
        The preprocessed directory path (``dataset_dir / 'fairseq_preprocessed_...'``).
    """
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: expose every data file under the requested language
            # suffixes by symlinking it to the file stored under the matching
            # entry of LANGUAGES.
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES, [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase, new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(get_data_filepath(dataset, phase, language))
            # The prefixes are the data file paths without a language suffix:
            # ask for a placeholder 'dummy' language and strip it again.
            trainpref, validpref, testpref = (
                str(get_data_filepath(dataset, phase, 'dummy')).replace('.dummy', '')
                for phase in ('train', 'valid', 'test')
            )
            # Build argv as a list instead of splitting a formatted string, so
            # paths containing whitespace or quotes reach fairseq unmodified.
            args = [
                '--source-lang', str(source_lang),
                '--target-lang', str(target_lang),
                '--trainpref', trainpref,
                '--validpref', validpref,
                '--testpref', testpref,
                '--destdir', str(preprocessed_dir),
                '--bpe', 'sentencepiece',
                '--joined-dictionary',
                '--workers', str(workers),
            ]
            if dict_path is not None:
                args.extend(['--srcdict', str(dict_path)])
            # Echo a copy-pastable command line; quoting only affects the
            # printed text, not the arguments handed to fairseq.
            printable_args = ' '.join(shlex.quote(arg) for arg in args)
            print(f'fairseq-preprocess {printable_args}')
            with mock_cli_args(args):
                preprocess.cli_main()
        return preprocessed_dir
#!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Legacy entry point. Use fairseq_cli/train.py or fairseq-train instead. """ from fairseq_cli.preprocess import cli_main if __name__ == "__main__": cli_main()
import re
import sys
from os import path

# Put the ``fairseq`` directory that sits next to this file's parent directory
# at the front of ``sys.path``; this has to happen before the ``fairseq_cli``
# import below.
_this_file = path.abspath(__file__)
fairseq_path = path.abspath(path.join(_this_file, '../../fairseq'))
sys.path.insert(0, fairseq_path)

from fairseq_cli.preprocess import cli_main

if __name__ == '__main__':
    # Drop a trailing ``-script.py`` / ``-script.pyw`` / ``.exe`` from the
    # program name (presumably added by console-script launchers) before
    # handing control to fairseq's CLI.
    _launcher_suffix = re.compile(r'(-script\.pyw?|\.exe)?$')
    sys.argv[0] = _launcher_suffix.sub('', sys.argv[0])
    sys.exit(cli_main())